[llvm] a70bbb5 - [AMDGPU] Simplify 64-bit division/remainder expansion

Jay Foad via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 12 07:50:55 PST 2021


Author: Jay Foad
Date: 2021-11-12T15:48:41Z
New Revision: a70bbb5f7af051a704a985402ff11a38a5f78984

URL: https://github.com/llvm/llvm-project/commit/a70bbb5f7af051a704a985402ff11a38a5f78984
DIFF: https://github.com/llvm/llvm-project/commit/a70bbb5f7af051a704a985402ff11a38a5f78984.diff

LOG: [AMDGPU] Simplify 64-bit division/remainder expansion

The old expansion open-coded a 64-bit addition in a strange way: it
added the high parts *without* the carry-in from the low part, and then
added that carry back in later on. Fixing this saves a couple of
instructions and makes the code much easier to understand.

Differential Revision: https://reviews.llvm.org/D113679
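
For illustration, a minimal standalone C++ sketch of the two schemes
(names like add64_direct/ALo are illustrative, not the actual DAG/MIR
nodes): composing a 64-bit add from 32-bit halves needs a single
add-with-carry chain, while the old expansion also kept a no-carry copy
of the high sum and folded the deferred low-part carry back in later.

    #include <cassert>
    #include <cstdint>

    // New scheme: one add-with-carry chain, like any 64-bit add.
    static uint64_t add64_direct(uint32_t ALo, uint32_t AHi,
                                 uint32_t BLo, uint32_t BHi) {
      uint32_t Lo = ALo + BLo;
      uint32_t Carry = Lo < ALo;       // carry-out of the low half
      uint32_t Hi = AHi + BHi + Carry; // carry-in consumed immediately
      return ((uint64_t)Hi << 32) | Lo;
    }

    // Old scheme: also form the high sum *without* the carry-in, then
    // patch the deferred carry in afterwards. The result is identical,
    // so the no-carry copy and the later fix-up add are redundant.
    static uint64_t add64_deferred(uint32_t ALo, uint32_t AHi,
                                   uint32_t BLo, uint32_t BHi) {
      uint32_t Lo = ALo + BLo;
      uint32_t Carry = Lo < ALo;
      uint32_t HiNc = AHi + BHi;  // high sum, no carry-in
      uint32_t Hi = HiNc + Carry; // carry folded back in later
      return ((uint64_t)Hi << 32) | Lo;
    }

    int main() {
      uint64_t A = 0x00000001FFFFFFFFull, B = 0x0000000200000001ull;
      uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
      uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
      assert(add64_direct(ALo, AHi, BLo, BHi) == A + B);
      assert(add64_deferred(ALo, AHi, BLo, BHi) == A + B);
    }

In the patch this is exactly what Add1_HiNc was: a no-carry copy of the
first round's high sum, kept alive so the second round could reconcile
the deferred carry with an extra ADDCARRY against zero. Using the
carried-in Add1_Hi directly makes both redundant.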

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
    llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
    llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
    llvm/test/CodeGen/AMDGPU/bypass-div.ll
    llvm/test/CodeGen/AMDGPU/carryout-selection.ll
    llvm/test/CodeGen/AMDGPU/sdiv64.ll
    llvm/test/CodeGen/AMDGPU/srem64.ll
    llvm/test/CodeGen/AMDGPU/udiv64.ll
    llvm/test/CodeGen/AMDGPU/urem64.ll
    llvm/test/CodeGen/AMDGPU/wave32.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2d42e7977365..5890f84ef198 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1839,6 +1839,9 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   }
 
   if (isTypeLegal(MVT::i64)) {
+    // The algorithm here is based on ideas from "Software Integer Division",
+    // Tom Rodeheffer, August 2008.
+
     MachineFunction &MF = DAG.getMachineFunction();
     const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
@@ -1873,37 +1876,35 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
     SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
     SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
 
+    // First round of UNR (Unsigned integer Newton-Raphson).
     SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
     SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
     SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
     SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
                                     Zero);
-    SDValue Mulhi1_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
-                                    One);
-
+    SDValue Mulhi1_Hi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
     SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
                                   Mulhi1_Lo, Zero1);
     SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
                                   Mulhi1_Hi, Add1_Lo.getValue(1));
-    SDValue Add1_HiNc = DAG.getNode(ISD::ADD, DL, HalfVT, Rcp_Hi, Mulhi1_Hi);
     SDValue Add1 = DAG.getBitcast(VT,
                         DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
 
+    // Second round of UNR.
     SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
     SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
     SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
                                     Zero);
-    SDValue Mulhi2_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
-                                    One);
-
+    SDValue Mulhi2_Hi =
+        DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
     SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
                                   Mulhi2_Lo, Zero1);
-    SDValue Add2_HiC = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_HiNc,
-                                   Mulhi2_Hi, Add1_Lo.getValue(1));
-    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add2_HiC,
-                                  Zero, Add2_Lo.getValue(1));
+    SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
+                                  Mulhi2_Hi, Add2_Lo.getValue(1));
     SDValue Add2 = DAG.getBitcast(VT,
                         DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
+
     SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
 
     SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);

diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6251a193393b..1f898f2ba8b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3051,7 +3051,6 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
 
   auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
   auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
-  auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
   auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
 
   auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
@@ -3062,9 +3061,7 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B,
 
   auto Zero32 = B.buildConstant(S32, 0);
   auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
-  auto Add2_HiC =
-      B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
-  auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
+  auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
   auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
 
   auto UnmergeNumer = B.buildUnmerge(S32, Numer);

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
index d2a68c2f0488..3b969e94d5ba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-sdiv.mir
@@ -415,72 +415,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -494,8 +492,8 @@ body: |
     ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -504,8 +502,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -587,72 +585,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -666,8 +662,8 @@ body: |
     ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -676,8 +672,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -759,72 +755,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -838,8 +832,8 @@ body: |
     ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -848,8 +842,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -945,72 +939,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -1024,8 +1016,8 @@ body: |
     ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]]
-    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]]
+    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]]
@@ -1034,8 +1026,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]]
-    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]]
-    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]]
+    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -1052,13 +1044,13 @@ body: |
     ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
-    ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32)
+    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
+    ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32)
     ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]]
-    ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
-    ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32)
+    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
+    ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32)
     ; GFX6-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX6-NEXT: [[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX6-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
@@ -1086,96 +1078,94 @@ body: |
     ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]]
     ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]]
     ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]]
     ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]]
+    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]]
     ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]]
-    ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]]
-    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]]
     ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]]
     ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]]
-    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]]
+    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]]
     ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]]
-    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]]
-    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]]
+    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]]
     ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]]
     ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]]
     ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]]
-    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]]
-    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]]
+    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]]
+    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]]
     ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]]
     ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]]
+    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]]
     ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]]
-    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]]
+    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]]
-    ; GFX6-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]]
-    ; GFX6-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]]
+    ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]]
     ; GFX6-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX6-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]]
-    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]]
+    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]]
     ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]]
     ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX6-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]]
     ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]]
+    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE18]]
     ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]]
-    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]]
+    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]]
     ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
     ; GFX6-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]]
     ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1)
-    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX6-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]]
+    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX6-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]]
     ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1)
-    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]]
-    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32)
+    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]]
+    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32)
     ; GFX6-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]]
     ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]]
-    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]]
+    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]]
     ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV52]], [[UADDO78]]
-    ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX6-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]]
-    ; GFX6-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]]
-    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]]
+    ; GFX6-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]]
+    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]]
     ; GFX6-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]]
     ; GFX6-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -1188,8 +1178,8 @@ body: |
     ; GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]]
     ; GFX6-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]]
-    ; GFX6-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]]
-    ; GFX6-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32)
+    ; GFX6-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]]
+    ; GFX6-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32)
     ; GFX6-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]]
     ; GFX6-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX6-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]]
@@ -1198,8 +1188,8 @@ body: |
     ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX6-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]]
-    ; GFX6-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]]
-    ; GFX6-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32)
+    ; GFX6-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]]
+    ; GFX6-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32)
     ; GFX6-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]]
     ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]]
     ; GFX6-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]]
@@ -1284,72 +1274,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -1363,8 +1351,8 @@ body: |
     ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]]
-    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]]
+    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]]
@@ -1373,8 +1361,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]]
-    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]]
-    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]]
+    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -1391,13 +1379,13 @@ body: |
     ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
-    ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32)
+    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
+    ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32)
     ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]]
-    ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
-    ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32)
+    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
+    ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32)
     ; GFX8-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX8-NEXT: [[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX8-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
@@ -1425,96 +1413,94 @@ body: |
     ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]]
     ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]]
     ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]]
     ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]]
+    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]]
     ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]]
-    ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]]
-    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]]
     ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]]
     ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]]
-    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]]
+    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]]
     ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]]
-    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]]
-    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]]
+    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]]
     ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]]
     ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]]
     ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]]
-    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]]
-    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]]
+    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]]
+    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]]
     ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]]
     ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]]
+    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]]
     ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]]
-    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]]
+    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]]
-    ; GFX8-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]]
-    ; GFX8-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]]
+    ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]]
     ; GFX8-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX8-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]]
-    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]]
+    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]]
     ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]]
     ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]]
     ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]]
+    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE18]]
     ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]]
-    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]]
+    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]]
     ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
     ; GFX8-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]]
     ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1)
-    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX8-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]]
+    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX8-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]]
     ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1)
-    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]]
-    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32)
+    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]]
+    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32)
     ; GFX8-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]]
     ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]]
-    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]]
+    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]]
     ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV52]], [[UADDO78]]
-    ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX8-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]]
-    ; GFX8-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]]
-    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]]
+    ; GFX8-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]]
+    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]]
     ; GFX8-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]]
     ; GFX8-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -1527,8 +1513,8 @@ body: |
     ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]]
     ; GFX8-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]]
-    ; GFX8-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]]
-    ; GFX8-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32)
+    ; GFX8-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]]
+    ; GFX8-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32)
     ; GFX8-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]]
     ; GFX8-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX8-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]]
@@ -1537,8 +1523,8 @@ body: |
     ; GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX8-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]]
-    ; GFX8-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]]
-    ; GFX8-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32)
+    ; GFX8-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]]
+    ; GFX8-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32)
     ; GFX8-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]]
     ; GFX8-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]]
     ; GFX8-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]]
@@ -1623,72 +1609,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -1702,8 +1686,8 @@ body: |
     ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV26]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV27]], [[UADDO39]]
-    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV27]], [[UADDO39]]
+    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV25]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV24]]
@@ -1712,8 +1696,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV28]]
-    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV29]], [[UADDO41]]
-    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV29]], [[UADDO41]]
+    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -1730,13 +1714,13 @@ body: |
     ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
-    ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE14]](s32)
+    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO43]]
+    ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO42]](s32), [[UADDE12]](s32)
     ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[UV40:%[0-9]+]]:_(s32), [[UV41:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UV38]], [[UV40]]
-    ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
-    ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE16]](s32)
+    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UV39]], [[UV41]], [[UADDO45]]
+    ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO44]](s32), [[UADDE14]](s32)
     ; GFX9-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX9-NEXT: [[XOR5:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX9-NEXT: [[UV42:%[0-9]+]]:_(s32), [[UV43:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
@@ -1764,96 +1748,94 @@ body: |
     ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[FPTOUI2]]
     ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH16]]
     ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH18]]
     ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD21]]
+    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD20]]
     ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO54]]
-    ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO57]]
-    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO57]]
     ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO56]]
     ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE10]], [[UADDO56]]
-    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE18]]
+    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE16]]
     ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO56]]
-    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[MUL24]]
-    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[MUL24]]
+    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO56]], [[ADD25]]
     ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[MUL24]]
     ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH21]]
     ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE18]], [[ADD27]]
-    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[MUL24]]
-    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE16]], [[ADD25]]
+    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[MUL24]]
+    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO56]], [[ADD25]]
     ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH23]]
     ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD28]]
+    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD26]]
     ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE18]], [[ADD27]]
-    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE16]], [[ADD25]]
+    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[UADDO66]]
-    ; GFX9-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO57]]
-    ; GFX9-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[C6]], [[UADDO69]]
+    ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[ADD29]], [[UADDO69]]
     ; GFX9-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX9-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDO68]]
-    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE22]]
+    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV50]], [[UADDE18]]
     ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDO68]]
     ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH25]]
     ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE22]]
+    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV51]], [[UADDE18]]
     ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDO68]]
-    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE22]]
+    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV50]], [[UADDE18]]
     ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
     ; GFX9-NEXT: [[UADDO76:%[0-9]+]]:_(s32), [[UADDO77:%[0-9]+]]:_(s1) = G_UADDO [[UADDO74]], [[UMULH27]]
     ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO77]](s1)
-    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX9-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD32]]
+    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX9-NEXT: [[UADDO78:%[0-9]+]]:_(s32), [[UADDO79:%[0-9]+]]:_(s1) = G_UADDO [[UADDO76]], [[ADD30]]
     ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO79]](s1)
-    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE22]]
-    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD35]](s32)
+    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV51]], [[UADDE18]]
+    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO78]](s32), [[ADD33]](s32)
     ; GFX9-NEXT: [[UV52:%[0-9]+]]:_(s32), [[UV53:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[UADDO78]]
     ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV53]], [[UADDO78]]
-    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD35]]
+    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV52]], [[ADD33]]
     ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV52]], [[UADDO78]]
-    ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX9-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV48]], [[MUL33]]
-    ; GFX9-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD37]], [[USUBO11]]
-    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD37]]
+    ; GFX9-NEXT: [[USUBE12:%[0-9]+]]:_(s32), [[USUBE13:%[0-9]+]]:_(s1) = G_USUBE [[UV49]], [[ADD35]], [[USUBO11]]
+    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV49]], [[ADD35]]
     ; GFX9-NEXT: [[UV54:%[0-9]+]]:_(s32), [[UV55:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR5]](s64)
     ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE12]](s32), [[UV55]]
     ; GFX9-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -1866,8 +1848,8 @@ body: |
     ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[USUBE14]], [[C6]], [[USUBO13]]
     ; GFX9-NEXT: [[UV56:%[0-9]+]]:_(s32), [[UV57:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO80:%[0-9]+]]:_(s32), [[UADDO81:%[0-9]+]]:_(s1) = G_UADDO [[UADDO78]], [[UV56]]
-    ; GFX9-NEXT: [[UADDE24:%[0-9]+]]:_(s32), [[UADDE25:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV57]], [[UADDO81]]
-    ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE24]](s32)
+    ; GFX9-NEXT: [[UADDE20:%[0-9]+]]:_(s32), [[UADDE21:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV57]], [[UADDO81]]
+    ; GFX9-NEXT: [[MV9:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO80]](s32), [[UADDE20]](s32)
     ; GFX9-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV55]]
     ; GFX9-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX9-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO12]](s32), [[UV54]]
@@ -1876,8 +1858,8 @@ body: |
     ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX9-NEXT: [[UV58:%[0-9]+]]:_(s32), [[UV59:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO82:%[0-9]+]]:_(s32), [[UADDO83:%[0-9]+]]:_(s1) = G_UADDO [[UADDO80]], [[UV58]]
-    ; GFX9-NEXT: [[UADDE26:%[0-9]+]]:_(s32), [[UADDE27:%[0-9]+]]:_(s1) = G_UADDE [[UADDE24]], [[UV59]], [[UADDO83]]
-    ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE26]](s32)
+    ; GFX9-NEXT: [[UADDE22:%[0-9]+]]:_(s32), [[UADDE23:%[0-9]+]]:_(s1) = G_UADDE [[UADDE20]], [[UV59]], [[UADDO83]]
+    ; GFX9-NEXT: [[MV10:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO82]](s32), [[UADDE22]](s32)
     ; GFX9-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C6]]
     ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV10]], [[MV9]]
     ; GFX9-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C6]]
@@ -2608,72 +2590,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -2687,8 +2667,8 @@ body: |
     ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -2697,8 +2677,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -2782,72 +2762,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -2861,8 +2839,8 @@ body: |
     ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -2871,8 +2849,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -2956,72 +2934,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -3035,8 +3011,8 @@ body: |
     ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UADDO36]], [[UV22]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV23]], [[UADDO39]]
-    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV23]], [[UADDO39]]
+    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV21]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV20]]
@@ -3045,8 +3021,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UV24]]
-    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UADDE10]], [[UV25]], [[UADDO41]]
-    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[UV25]], [[UADDO41]]
+    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV4]], [[MV3]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]

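(Annotation, not part of the diff: every hunk in these legalize-*.mir files rewrites the same autogenerated pattern. The removed CHECK lines computed the high half of a 64-bit sum with a plain G_ADD and then folded the low-half carry back in through an extra G_UADDE; the new lines feed the G_UADDO carry-out straight into a single G_UADDE. The following is a minimal C++ sketch of the two shapes for readers skimming the CHECK lines; the function and variable names are hypothetical stand-ins, not anything from the lowering code.)

    #include <cstdint>

    // Shape of the removed CHECK lines: add the high halves with no
    // carry-in, then patch the low-half carry in afterwards.
    uint64_t add64WithoutCarryChain(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
      uint32_t Lo = ALo + BLo;         // G_UADDO
      uint32_t Carry = Lo < ALo;       // carry-out of the low add
      uint32_t Hi = AHi + BHi;         // G_ADD: high halves, no carry-in
      Hi += Carry;                     // extra G_UADDE patches the carry
      return (uint64_t(Hi) << 32) | Lo;
    }

    // Shape of the new CHECK lines: one carry-chained pair, the G_UADDO
    // carry consumed directly by a single G_UADDE.
    uint64_t add64WithCarryChain(uint64_t A, uint64_t B) {
      uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
      uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);
      uint32_t Lo = ALo + BLo;         // G_UADDO
      uint32_t Carry = Lo < ALo;       // carry-out of the low add
      uint32_t Hi = AHi + BHi + Carry; // single G_UADDE with carry-in
      return (uint64_t(Hi) << 32) | Lo;
    }

Both forms compute the same 64-bit sum; the second simply needs one instruction fewer per addition, which is why the virtual-register numbering (ADD*, UADDE*) shifts down throughout the hunks above and below.
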
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
index 20bceef5e179..cc23f0e64e56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-srem.mir
@@ -382,71 +382,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
@@ -549,71 +547,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
@@ -716,71 +712,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
@@ -897,71 +891,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
@@ -999,13 +991,13 @@ body: |
     ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
-    ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
+    ; GFX6-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
-    ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
+    ; GFX6-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX6-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX6-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
@@ -1033,95 +1025,93 @@ body: |
     ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]]
     ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]]
     ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]]
-    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]]
     ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]]
     ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]]
+    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]]
     ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
-    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]]
-    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]]
-    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]]
     ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]]
     ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]]
-    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]]
+    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]]
     ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]]
-    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]]
-    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]]
+    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]]
     ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]]
     ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]]
     ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]]
-    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]]
-    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]]
+    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]]
+    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]]
     ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]]
     ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]]
+    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]]
     ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
-    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]]
-    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]]
+    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]]
-    ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO53]]
-    ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]]
+    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]]
     ; GFX6-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX6-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]]
-    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]]
+    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]]
     ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]]
     ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]]
     ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]]
+    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]]
     ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]]
-    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]]
+    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]]
     ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX6-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]]
     ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]]
+    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]]
     ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
-    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]]
-    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]]
+    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX6-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]]
     ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]]
-    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]]
+    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]]
     ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]]
-    ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX6-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]]
-    ; GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]]
-    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]]
+    ; GFX6-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]]
+    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]]
     ; GFX6-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32)
     ; GFX6-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]]
@@ -1227,71 +1217,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
@@ -1329,13 +1317,13 @@ body: |
     ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
-    ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
+    ; GFX8-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
-    ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
+    ; GFX8-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX8-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX8-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
@@ -1363,95 +1351,93 @@ body: |
     ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]]
     ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]]
     ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]]
-    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]]
     ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]]
     ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]]
+    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]]
     ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
-    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]]
-    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]]
-    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]]
     ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]]
     ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]]
-    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]]
+    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]]
     ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]]
-    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]]
-    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]]
+    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]]
     ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]]
     ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]]
     ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]]
-    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]]
-    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]]
+    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]]
+    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]]
     ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]]
     ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]]
+    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]]
     ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
-    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]]
-    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]]
+    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]]
-    ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO53]]
-    ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]]
+    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]]
     ; GFX8-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX8-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]]
-    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]]
+    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]]
     ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]]
     ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]]
     ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]]
+    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]]
     ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]]
-    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]]
+    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]]
     ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]]
     ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]]
+    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]]
     ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
-    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]]
-    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]]
+    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX8-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]]
     ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]]
-    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]]
+    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]]
     ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]]
-    ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX8-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]]
-    ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]]
-    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]]
+    ; GFX8-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]]
+    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]]
     ; GFX8-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32)
     ; GFX8-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]]
@@ -1557,71 +1543,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV20]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV21]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV20]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV21]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV22:%[0-9]+]]:_(s32), [[UV23:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV23]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV22]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV22]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV18]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV19]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV19]], [[ADD17]]
     ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV25]]
@@ -1659,13 +1643,13 @@ body: |
     ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR2]](s64)
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[UV30]], [[UV32]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
-    ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE10]](s32)
+    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UV31]], [[UV33]], [[UADDO39]]
+    ; GFX9-NEXT: [[MV6:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO38]](s32), [[UADDE8]](s32)
     ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR3]](s64)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UV34]], [[UV36]]
-    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
-    ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE12]](s32)
+    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UV35]], [[UV37]], [[UADDO41]]
+    ; GFX9-NEXT: [[MV7:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO40]](s32), [[UADDE10]](s32)
     ; GFX9-NEXT: [[XOR3:%[0-9]+]]:_(s64) = G_XOR [[MV6]], [[ASHR2]]
     ; GFX9-NEXT: [[XOR4:%[0-9]+]]:_(s64) = G_XOR [[MV7]], [[ASHR3]]
     ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
@@ -1693,95 +1677,93 @@ body: |
     ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[FPTOUI2]]
     ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[FPTOUI3]]
     ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[FPTOUI2]]
-    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH16]]
     ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH18]]
     ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD21]]
+    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[ADD20]]
     ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
-    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO50]]
-    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO53]]
-    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO53]]
     ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDO52]]
     ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE14]], [[UADDO52]]
-    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE14]]
+    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO10]], [[UADDE12]]
     ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO10]], [[UADDO52]]
-    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[MUL24]]
-    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[MUL24]]
+    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO52]], [[ADD25]]
     ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[MUL24]]
     ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH21]]
     ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE14]], [[ADD27]]
-    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[MUL24]]
-    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE12]], [[ADD25]]
+    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[MUL24]]
+    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO52]], [[ADD25]]
     ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH23]]
     ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD28]]
+    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[UADDO60]], [[ADD26]]
     ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
-    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE14]], [[ADD27]]
-    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE12]], [[ADD25]]
+    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[UADDO62]]
-    ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO53]]
-    ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[C6]], [[UADDO65]]
+    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[ADD29]], [[UADDO65]]
     ; GFX9-NEXT: [[UV44:%[0-9]+]]:_(s32), [[UV45:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX9-NEXT: [[UV46:%[0-9]+]]:_(s32), [[UV47:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR3]](s64)
     ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDO64]]
-    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE18]]
+    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV46]], [[UADDE14]]
     ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDO64]]
     ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH25]]
     ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE18]]
+    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV47]], [[UADDE14]]
     ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDO64]]
-    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE18]]
+    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV46]], [[UADDE14]]
     ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
     ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UMULH27]]
     ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO73]](s1)
-    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD32]]
+    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[ADD30]]
     ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO75]](s1)
-    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE18]]
-    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV47]], [[UADDE14]]
+    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX9-NEXT: [[UV48:%[0-9]+]]:_(s32), [[UV49:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[UADDO74]]
     ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV49]], [[UADDO74]]
-    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD35]]
+    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV48]], [[ADD33]]
     ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV48]], [[UADDO74]]
-    ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX9-NEXT: [[USUBO12:%[0-9]+]]:_(s32), [[USUBO13:%[0-9]+]]:_(s1) = G_USUBO [[UV44]], [[MUL33]]
-    ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD37]], [[USUBO13]]
-    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD37]]
+    ; GFX9-NEXT: [[USUBE16:%[0-9]+]]:_(s32), [[USUBE17:%[0-9]+]]:_(s1) = G_USUBE [[UV45]], [[ADD35]], [[USUBO13]]
+    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV45]], [[ADD35]]
     ; GFX9-NEXT: [[MV8:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO12]](s32), [[USUBE16]](s32)
     ; GFX9-NEXT: [[UV50:%[0-9]+]]:_(s32), [[UV51:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR4]](s64)
     ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE16]](s32), [[UV51]]
@@ -2482,71 +2464,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
@@ -2651,71 +2631,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]
@@ -2820,71 +2798,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO12]]
     ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO15]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO14]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO14]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO14]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO20]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO23]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[UADDO22]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO14]], [[UADDO24]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO15]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[C6]], [[UADDO27]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD11]], [[UADDO27]]
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDO26]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE8]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV17]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDO26]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE8]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV16]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE8]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV17]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[UADDO36]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV19]], [[UADDO36]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV18]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV18]], [[UADDO36]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV14]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV15]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV15]], [[ADD17]]
     ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[XOR1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV21]]

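(The same pattern repeats in each hunk above: the removed lines carry an extra G_ADD of the two high parts plus a second G_UADDE against the zero constant C5/C6, while the added lines keep a single G_UADDE whose carry-in is the carry-out of the matching low-half G_UADDO. As a minimal standalone C++ sketch of that add-with-carry shape -- illustrative only, the names are not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    // Split a 64-bit add into 32-bit halves, mirroring the
    // G_UADDO/G_UADDE chain checked in the MIR above.
    static uint64_t add64_via_halves(uint64_t a, uint64_t b) {
      uint32_t a_lo = static_cast<uint32_t>(a);
      uint32_t a_hi = static_cast<uint32_t>(a >> 32);
      uint32_t b_lo = static_cast<uint32_t>(b);
      uint32_t b_hi = static_cast<uint32_t>(b >> 32);
      uint32_t lo = a_lo + b_lo;           // G_UADDO: low-half add ...
      uint32_t carry = lo < a_lo ? 1 : 0;  // ... producing a carry-out
      uint32_t hi = a_hi + b_hi + carry;   // G_UADDE: high-half add with carry-in
      return (static_cast<uint64_t>(hi) << 32) | lo;
    }

    int main() {
      assert(add64_via_halves(0xffffffffu, 1) == 0x100000000ull);
      assert(add64_via_halves(0x123456789abcdef0ull, 0x0fedcba987654321ull) ==
             0x123456789abcdef0ull + 0x0fedcba987654321ull);
      return 0;
    }

The legalize-udiv.mir hunks below check the same single-carry-chain shape.)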
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
index e81be9cb654b..9ce1c81a5393 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-udiv.mir
@@ -313,72 +313,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -392,8 +390,8 @@ body: |
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -402,8 +400,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -463,72 +461,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -542,8 +538,8 @@ body: |
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -552,8 +548,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -613,72 +609,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -692,8 +686,8 @@ body: |
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -702,8 +696,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -777,72 +771,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -856,8 +848,8 @@ body: |
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV19]], [[UADDO35]]
-    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]]
+    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]]
@@ -866,8 +858,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -897,96 +889,94 @@ body: |
     ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]]
     ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]]
     ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]]
-    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]]
     ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]]
     ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]]
+    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]]
     ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
-    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]]
-    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]]
     ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]]
     ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]]
-    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]]
+    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]]
     ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]]
-    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]]
-    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]]
+    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]]
     ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]]
     ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH21]]
     ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]]
-    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[MUL24]]
-    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]]
+    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]]
+    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]]
     ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]]
     ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]]
+    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]]
     ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
-    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]]
-    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]]
+    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]]
-    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]]
-    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]]
+    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]]
     ; GFX6-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]]
-    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]]
+    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]]
     ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]]
     ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]]
     ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]]
+    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]]
     ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]]
-    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]]
+    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]]
     ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX6-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]]
     ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]]
+    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX6-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]]
     ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
-    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]]
-    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32)
+    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]]
+    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32)
     ; GFX6-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]]
     ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]]
-    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]]
+    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]]
     ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]]
-    ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX6-NEXT: [[USUBO8:%[0-9]+]]:_(s32), [[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]]
-    ; GFX6-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]]
-    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]]
+    ; GFX6-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD35]], [[USUBO9]]
+    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]]
     ; GFX6-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]]
     ; GFX6-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -999,8 +989,8 @@ body: |
     ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]]
     ; GFX6-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]]
-    ; GFX6-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]]
-    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32)
+    ; GFX6-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]]
+    ; GFX6-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32)
     ; GFX6-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]]
     ; GFX6-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX6-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]]
@@ -1009,8 +999,8 @@ body: |
     ; GFX6-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX6-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX6-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]]
-    ; GFX6-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[UV39]], [[UADDO75]]
-    ; GFX6-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32)
+    ; GFX6-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]]
+    ; GFX6-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE14]](s32)
     ; GFX6-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]]
     ; GFX6-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]]
     ; GFX6-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]]
@@ -1073,72 +1063,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -1152,8 +1140,8 @@ body: |
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV19]], [[UADDO35]]
-    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]]
+    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]]
@@ -1162,8 +1150,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -1193,96 +1181,94 @@ body: |
     ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]]
     ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]]
     ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]]
-    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]]
     ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]]
     ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]]
+    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]]
     ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
-    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]]
-    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]]
     ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]]
     ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]]
-    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]]
+    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]]
     ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]]
-    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]]
-    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]]
+    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]]
     ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]]
     ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH21]]
     ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]]
-    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[MUL24]]
-    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]]
+    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]]
+    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]]
     ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]]
     ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]]
+    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]]
     ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
-    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]]
-    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]]
+    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]]
-    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]]
-    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]]
+    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]]
     ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]]
-    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]]
+    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]]
     ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]]
     ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]]
     ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]]
+    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]]
     ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]]
-    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]]
+    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]]
     ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX8-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]]
     ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]]
+    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX8-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]]
     ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
-    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]]
-    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32)
+    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]]
+    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32)
     ; GFX8-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]]
     ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]]
-    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]]
+    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]]
     ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]]
-    ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX8-NEXT: [[USUBO8:%[0-9]+]]:_(s32), [[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]]
-    ; GFX8-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]]
-    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]]
+    ; GFX8-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD35]], [[USUBO9]]
+    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]]
     ; GFX8-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]]
     ; GFX8-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -1295,8 +1281,8 @@ body: |
     ; GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]]
     ; GFX8-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]]
-    ; GFX8-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]]
-    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32)
+    ; GFX8-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]]
+    ; GFX8-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32)
     ; GFX8-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]]
     ; GFX8-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX8-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]]
@@ -1305,8 +1291,8 @@ body: |
     ; GFX8-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX8-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX8-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]]
-    ; GFX8-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[UV39]], [[UADDO75]]
-    ; GFX8-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32)
+    ; GFX8-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]]
+    ; GFX8-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE14]](s32)
     ; GFX8-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]]
     ; GFX8-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]]
     ; GFX8-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]]
@@ -1369,72 +1355,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -1448,8 +1432,8 @@ body: |
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV18]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV19]], [[UADDO35]]
-    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV19]], [[UADDO35]]
+    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV17]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV16]]
@@ -1458,8 +1442,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV20:%[0-9]+]]:_(s32), [[UV21:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV20]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV21]], [[UADDO37]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV21]], [[UADDO37]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C5]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C5]]
@@ -1489,96 +1473,94 @@ body: |
     ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[FPTOUI2]]
     ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[FPTOUI3]]
     ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[FPTOUI2]]
-    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH16]]
     ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
     ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[UADDO42]], [[UMULH18]]
     ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO45]](s1)
-    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD21]]
+    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[ADD20]]
     ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
-    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO46]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO49]]
-    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO49]]
     ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDO48]]
     ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE8]], [[UADDO48]]
-    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE10]]
+    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO6]], [[UADDE8]]
     ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO6]], [[UADDO48]]
-    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[MUL24]]
-    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[MUL24]]
+    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO48]], [[ADD25]]
     ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[MUL24]]
     ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH21]]
     ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE10]], [[ADD27]]
-    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[MUL24]]
-    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE8]], [[ADD25]]
+    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[MUL24]]
+    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO48]], [[ADD25]]
     ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
     ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO54]], [[UMULH23]]
     ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO57]](s1)
-    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD28]]
+    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[UADDO56]], [[ADD26]]
     ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
-    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE10]], [[ADD27]]
-    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE8]], [[ADD25]]
+    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO48]], [[UADDO58]]
-    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO49]]
-    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[C5]], [[UADDO61]]
+    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[ADD29]], [[UADDO61]]
     ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDO60]]
-    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE14]]
+    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV30]], [[UADDE10]]
     ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDO60]]
     ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH25]]
     ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE14]]
+    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV31]], [[UADDE10]]
     ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDO60]]
-    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE14]]
+    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV30]], [[UADDE10]]
     ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
     ; GFX9-NEXT: [[UADDO68:%[0-9]+]]:_(s32), [[UADDO69:%[0-9]+]]:_(s1) = G_UADDO [[UADDO66]], [[UMULH27]]
     ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO69]](s1)
-    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD32]]
+    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX9-NEXT: [[UADDO70:%[0-9]+]]:_(s32), [[UADDO71:%[0-9]+]]:_(s1) = G_UADDO [[UADDO68]], [[ADD30]]
     ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO71]](s1)
-    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE14]]
-    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
-    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD35]](s32)
+    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV31]], [[UADDE10]]
+    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
+    ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO70]](s32), [[ADD33]](s32)
     ; GFX9-NEXT: [[UV32:%[0-9]+]]:_(s32), [[UV33:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[UADDO70]]
     ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV33]], [[UADDO70]]
-    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD35]]
+    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV32]], [[ADD33]]
     ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV32]], [[UADDO70]]
-    ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX9-NEXT: [[USUBO8:%[0-9]+]]:_(s32), [[USUBO9:%[0-9]+]]:_(s1) = G_USUBO [[UV28]], [[MUL33]]
-    ; GFX9-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD37]], [[USUBO9]]
-    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD37]]
+    ; GFX9-NEXT: [[USUBE10:%[0-9]+]]:_(s32), [[USUBE11:%[0-9]+]]:_(s1) = G_USUBE [[UV29]], [[ADD35]], [[USUBO9]]
+    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV29]], [[ADD35]]
     ; GFX9-NEXT: [[UV34:%[0-9]+]]:_(s32), [[UV35:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE10]](s32), [[UV35]]
     ; GFX9-NEXT: [[SEXT4:%[0-9]+]]:_(s32) = G_SEXT [[ICMP8]](s1)
@@ -1591,8 +1573,8 @@ body: |
     ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[USUBE12]], [[C5]], [[USUBO11]]
     ; GFX9-NEXT: [[UV36:%[0-9]+]]:_(s32), [[UV37:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO72:%[0-9]+]]:_(s32), [[UADDO73:%[0-9]+]]:_(s1) = G_UADDO [[UADDO70]], [[UV36]]
-    ; GFX9-NEXT: [[UADDE16:%[0-9]+]]:_(s32), [[UADDE17:%[0-9]+]]:_(s1) = G_UADDE [[ADD35]], [[UV37]], [[UADDO73]]
-    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE16]](s32)
+    ; GFX9-NEXT: [[UADDE12:%[0-9]+]]:_(s32), [[UADDE13:%[0-9]+]]:_(s1) = G_UADDE [[ADD33]], [[UV37]], [[UADDO73]]
+    ; GFX9-NEXT: [[MV4:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO72]](s32), [[UADDE12]](s32)
     ; GFX9-NEXT: [[ICMP11:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV35]]
     ; GFX9-NEXT: [[SEXT6:%[0-9]+]]:_(s32) = G_SEXT [[ICMP11]](s1)
     ; GFX9-NEXT: [[ICMP12:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO10]](s32), [[UV34]]
@@ -1601,8 +1583,8 @@ body: |
     ; GFX9-NEXT: [[SELECT5:%[0-9]+]]:_(s32) = G_SELECT [[ICMP13]](s1), [[SEXT7]], [[SEXT6]]
     ; GFX9-NEXT: [[UV38:%[0-9]+]]:_(s32), [[UV39:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C6]](s64)
     ; GFX9-NEXT: [[UADDO74:%[0-9]+]]:_(s32), [[UADDO75:%[0-9]+]]:_(s1) = G_UADDO [[UADDO72]], [[UV38]]
-    ; GFX9-NEXT: [[UADDE18:%[0-9]+]]:_(s32), [[UADDE19:%[0-9]+]]:_(s1) = G_UADDE [[UADDE16]], [[UV39]], [[UADDO75]]
-    ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE18]](s32)
+    ; GFX9-NEXT: [[UADDE14:%[0-9]+]]:_(s32), [[UADDE15:%[0-9]+]]:_(s1) = G_UADDE [[UADDE12]], [[UV39]], [[UADDO75]]
+    ; GFX9-NEXT: [[MV5:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO74]](s32), [[UADDE14]](s32)
     ; GFX9-NEXT: [[ICMP14:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT5]](s32), [[C5]]
     ; GFX9-NEXT: [[SELECT6:%[0-9]+]]:_(s64) = G_SELECT [[ICMP14]](s1), [[MV5]], [[MV4]]
     ; GFX9-NEXT: [[ICMP15:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT4]](s32), [[C5]]
@@ -2178,72 +2160,70 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX6-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -2257,8 +2237,8 @@ body: |
     ; GFX6-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX6-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX6-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -2267,8 +2247,8 @@ body: |
     ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX6-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX6-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX6-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX6-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -2331,72 +2311,70 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX8-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -2410,8 +2388,8 @@ body: |
     ; GFX8-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX8-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX8-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -2420,8 +2398,8 @@ body: |
     ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX8-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX8-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX8-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX8-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]
@@ -2484,72 +2462,70 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
-    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD16]](s32)
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
+    ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO32]](s32), [[ADD15]](s32)
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
     ; GFX9-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[ICMP]](s1)
@@ -2563,8 +2539,8 @@ body: |
     ; GFX9-NEXT: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[UADDO32]], [[UV14]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[ADD16]], [[UV15]], [[UADDO35]]
-    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE6]](s32)
+    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[ADD15]], [[UV15]], [[UADDO35]]
+    ; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO34]](s32), [[UADDE4]](s32)
     ; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE6]](s32), [[UV13]]
     ; GFX9-NEXT: [[SEXT2:%[0-9]+]]:_(s32) = G_SEXT [[ICMP3]](s1)
     ; GFX9-NEXT: [[ICMP4:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBO4]](s32), [[UV12]]
@@ -2573,8 +2549,8 @@ body: |
     ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s32) = G_SELECT [[ICMP5]](s1), [[SEXT3]], [[SEXT2]]
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C7]](s64)
     ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UV16]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[UADDE6]], [[UV17]], [[UADDO37]]
-    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE8]](s32)
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[UV17]], [[UADDO37]]
+    ; GFX9-NEXT: [[MV2:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO36]](s32), [[UADDE6]](s32)
     ; GFX9-NEXT: [[ICMP6:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT1]](s32), [[C6]]
     ; GFX9-NEXT: [[SELECT2:%[0-9]+]]:_(s64) = G_SELECT [[ICMP6]](s1), [[MV2]], [[MV1]]
     ; GFX9-NEXT: [[ICMP7:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[SELECT]](s32), [[C6]]

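Every hunk above and below makes the same change: the deleted plain G_ADD of [[FPTOUI1]] and [[ADD5]] built the high half of a 64-bit sum with no carry-in from the low half, and a second G_UADDE against the zero constant then folded the low-half carry back in; the updated checks instead expect a single G_UADDO/G_UADDE carry chain. As a simplified, standalone C++ sketch of the two shapes (illustrative names only, not code from this patch):

  #include <cstdint>

  // Old shape: carry-less add of the high halves, with the carry out of
  // the low halves folded back in afterwards (the extra G_ADD that the
  // hunks delete).
  static void add64_late_carry(uint32_t alo, uint32_t ahi,
                               uint32_t blo, uint32_t bhi,
                               uint32_t &lo, uint32_t &hi) {
    uint32_t hi_no_carry = ahi + bhi;  // high halves, no carry-in
    lo = alo + blo;
    uint32_t carry = lo < alo ? 1 : 0; // carry out of the low add
    hi = hi_no_carry + carry;          // add the carry back in later
  }

  // New shape: one ordinary add-with-carry chain, matching the single
  // G_UADDO/G_UADDE pair in the updated checks.
  static void add64_carry_chain(uint32_t alo, uint32_t ahi,
                                uint32_t blo, uint32_t bhi,
                                uint32_t &lo, uint32_t &hi) {
    lo = alo + blo;
    uint32_t carry = lo < alo ? 1 : 0;
    hi = ahi + bhi + carry;
  }

Both forms compute the same 64-bit sum; the chained form just drops the redundant plain add and one of the two G_UADDEs, which is why each of the large hunks above shrinks by two lines.
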
diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
index 68b3c314bd61..5040c04428e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-urem.mir
@@ -289,71 +289,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
@@ -435,71 +433,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
@@ -581,71 +577,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
@@ -741,71 +735,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
@@ -857,95 +849,93 @@ body: |
     ; GFX6-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]]
     ; GFX6-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX6-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX6-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX6-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX6-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX6-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
     ; GFX6-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]]
     ; GFX6-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX6-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX6-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX6-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX6-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX6-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX6-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX6-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]]
     ; GFX6-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]]
+    ; GFX6-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX6-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]]
     ; GFX6-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
-    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX6-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX6-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX6-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX6-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]]
-    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]]
-    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]]
     ; GFX6-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]]
     ; GFX6-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]]
-    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]]
+    ; GFX6-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]]
     ; GFX6-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]]
-    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]]
-    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX6-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX6-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]]
+    ; GFX6-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]]
     ; GFX6-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]]
     ; GFX6-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX6-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX6-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH21]]
     ; GFX6-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]]
-    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]]
-    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]]
+    ; GFX6-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX6-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]]
+    ; GFX6-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]]
+    ; GFX6-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]]
     ; GFX6-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX6-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX6-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]]
     ; GFX6-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]]
+    ; GFX6-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX6-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]]
     ; GFX6-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[ADD27]]
-    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX6-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX6-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]]
+    ; GFX6-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX6-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]]
-    ; GFX6-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]]
-    ; GFX6-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]]
+    ; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]]
     ; GFX6-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX6-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]]
-    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]]
+    ; GFX6-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]]
     ; GFX6-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX6-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX6-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]]
     ; GFX6-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]]
+    ; GFX6-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX6-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]]
     ; GFX6-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]]
-    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]]
+    ; GFX6-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]]
     ; GFX6-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX6-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX6-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]]
     ; GFX6-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]]
+    ; GFX6-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX6-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]]
     ; GFX6-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]]
-    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX6-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX6-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]]
+    ; GFX6-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX6-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]]
     ; GFX6-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]]
-    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]]
+    ; GFX6-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]]
     ; GFX6-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]]
-    ; GFX6-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX6-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX6-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX6-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX6-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]]
-    ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]]
-    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]]
+    ; GFX6-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]]
+    ; GFX6-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]]
     ; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO10]](s32), [[USUBE14]](s32)
     ; GFX6-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX6-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]]
@@ -1030,71 +1020,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
@@ -1146,95 +1134,93 @@ body: |
     ; GFX8-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]]
     ; GFX8-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX8-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX8-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX8-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX8-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX8-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
     ; GFX8-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]]
     ; GFX8-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX8-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX8-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX8-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX8-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX8-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX8-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX8-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]]
     ; GFX8-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]]
+    ; GFX8-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX8-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]]
     ; GFX8-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
-    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX8-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX8-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX8-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX8-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]]
-    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]]
-    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]]
     ; GFX8-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]]
     ; GFX8-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]]
-    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]]
+    ; GFX8-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]]
     ; GFX8-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]]
-    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]]
-    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX8-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX8-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]]
+    ; GFX8-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]]
     ; GFX8-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]]
     ; GFX8-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX8-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX8-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH21]]
     ; GFX8-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]]
-    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]]
-    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]]
+    ; GFX8-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX8-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]]
+    ; GFX8-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]]
+    ; GFX8-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]]
     ; GFX8-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX8-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX8-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]]
     ; GFX8-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]]
+    ; GFX8-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX8-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]]
     ; GFX8-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[ADD27]]
-    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX8-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX8-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]]
+    ; GFX8-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX8-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]]
-    ; GFX8-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]]
-    ; GFX8-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]]
+    ; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]]
     ; GFX8-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX8-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]]
-    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]]
+    ; GFX8-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]]
     ; GFX8-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX8-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX8-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]]
     ; GFX8-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]]
+    ; GFX8-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX8-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]]
     ; GFX8-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]]
-    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]]
+    ; GFX8-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]]
     ; GFX8-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX8-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX8-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]]
     ; GFX8-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]]
+    ; GFX8-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX8-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]]
     ; GFX8-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]]
-    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX8-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX8-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]]
+    ; GFX8-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX8-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]]
     ; GFX8-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]]
-    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]]
+    ; GFX8-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]]
     ; GFX8-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]]
-    ; GFX8-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX8-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX8-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX8-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX8-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]]
-    ; GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]]
-    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]]
+    ; GFX8-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]]
+    ; GFX8-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]]
     ; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO10]](s32), [[USUBE14]](s32)
     ; GFX8-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX8-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]]
@@ -1319,71 +1305,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C5]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV12]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV13]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV12]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV13]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV15]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV14]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV14]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV10]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV11]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV11]], [[ADD17]]
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV2]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV17]]
@@ -1435,95 +1419,93 @@ body: |
     ; GFX9-NEXT: [[MUL19:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[FPTOUI2]]
     ; GFX9-NEXT: [[MUL20:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[FPTOUI3]]
     ; GFX9-NEXT: [[UMULH15:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[FPTOUI2]]
-    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
-    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ADD19]], [[UMULH15]]
+    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[MUL19]], [[MUL20]]
+    ; GFX9-NEXT: [[ADD19:%[0-9]+]]:_(s32) = G_ADD [[ADD18]], [[UMULH15]]
     ; GFX9-NEXT: [[MUL21:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[MUL22:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH16:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[MUL18]]
     ; GFX9-NEXT: [[UADDO34:%[0-9]+]]:_(s32), [[UADDO35:%[0-9]+]]:_(s1) = G_UADDO [[MUL21]], [[MUL22]]
     ; GFX9-NEXT: [[ZEXT15:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO35]](s1)
     ; GFX9-NEXT: [[UADDO36:%[0-9]+]]:_(s32), [[UADDO37:%[0-9]+]]:_(s1) = G_UADDO [[UADDO34]], [[UMULH16]]
     ; GFX9-NEXT: [[ZEXT16:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO37]](s1)
-    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
-    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD20]]
+    ; GFX9-NEXT: [[ADD20:%[0-9]+]]:_(s32) = G_ADD [[ZEXT15]], [[ZEXT16]]
+    ; GFX9-NEXT: [[MUL23:%[0-9]+]]:_(s32) = G_MUL [[FPTOUI3]], [[ADD19]]
     ; GFX9-NEXT: [[UMULH17:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[MUL18]]
-    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD20]]
+    ; GFX9-NEXT: [[UMULH18:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI2]], [[ADD19]]
     ; GFX9-NEXT: [[UADDO38:%[0-9]+]]:_(s32), [[UADDO39:%[0-9]+]]:_(s1) = G_UADDO [[MUL23]], [[UMULH17]]
     ; GFX9-NEXT: [[ZEXT17:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO39]](s1)
     ; GFX9-NEXT: [[UADDO40:%[0-9]+]]:_(s32), [[UADDO41:%[0-9]+]]:_(s1) = G_UADDO [[UADDO38]], [[UMULH18]]
     ; GFX9-NEXT: [[ZEXT18:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO41]](s1)
-    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
-    ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD21]]
+    ; GFX9-NEXT: [[ADD21:%[0-9]+]]:_(s32) = G_ADD [[ZEXT17]], [[ZEXT18]]
+    ; GFX9-NEXT: [[UADDO42:%[0-9]+]]:_(s32), [[UADDO43:%[0-9]+]]:_(s1) = G_UADDO [[UADDO40]], [[ADD20]]
     ; GFX9-NEXT: [[ZEXT19:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO43]](s1)
-    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[ADD22]], [[ZEXT19]]
-    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD20]]
-    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD23]]
+    ; GFX9-NEXT: [[ADD22:%[0-9]+]]:_(s32) = G_ADD [[ADD21]], [[ZEXT19]]
+    ; GFX9-NEXT: [[UMULH19:%[0-9]+]]:_(s32) = G_UMULH [[FPTOUI3]], [[ADD19]]
+    ; GFX9-NEXT: [[ADD23:%[0-9]+]]:_(s32) = G_ADD [[UMULH19]], [[ADD22]]
     ; GFX9-NEXT: [[UADDO44:%[0-9]+]]:_(s32), [[UADDO45:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI2]], [[UADDO42]]
-    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD24]], [[UADDO45]]
-    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI3]], [[ADD24]]
+    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI3]], [[ADD23]], [[UADDO45]]
     ; GFX9-NEXT: [[MUL24:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDO44]]
     ; GFX9-NEXT: [[MUL25:%[0-9]+]]:_(s32) = G_MUL [[USUBE12]], [[UADDO44]]
-    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE6]]
+    ; GFX9-NEXT: [[MUL26:%[0-9]+]]:_(s32) = G_MUL [[USUBO8]], [[UADDE4]]
     ; GFX9-NEXT: [[UMULH20:%[0-9]+]]:_(s32) = G_UMULH [[USUBO8]], [[UADDO44]]
-    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
-    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ADD26]], [[UMULH20]]
-    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[MUL24]]
-    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD24:%[0-9]+]]:_(s32) = G_ADD [[MUL25]], [[MUL26]]
+    ; GFX9-NEXT: [[ADD25:%[0-9]+]]:_(s32) = G_ADD [[ADD24]], [[UMULH20]]
+    ; GFX9-NEXT: [[MUL27:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[MUL24]]
+    ; GFX9-NEXT: [[MUL28:%[0-9]+]]:_(s32) = G_MUL [[UADDO44]], [[ADD25]]
     ; GFX9-NEXT: [[UMULH21:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[MUL24]]
     ; GFX9-NEXT: [[UADDO46:%[0-9]+]]:_(s32), [[UADDO47:%[0-9]+]]:_(s1) = G_UADDO [[MUL27]], [[MUL28]]
     ; GFX9-NEXT: [[ZEXT20:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO47]](s1)
     ; GFX9-NEXT: [[UADDO48:%[0-9]+]]:_(s32), [[UADDO49:%[0-9]+]]:_(s1) = G_UADDO [[UADDO46]], [[UMULH21]]
     ; GFX9-NEXT: [[ZEXT21:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO49]](s1)
-    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
-    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE6]], [[ADD27]]
-    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[MUL24]]
-    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD27]]
+    ; GFX9-NEXT: [[ADD26:%[0-9]+]]:_(s32) = G_ADD [[ZEXT20]], [[ZEXT21]]
+    ; GFX9-NEXT: [[MUL29:%[0-9]+]]:_(s32) = G_MUL [[UADDE4]], [[ADD25]]
+    ; GFX9-NEXT: [[UMULH22:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[MUL24]]
+    ; GFX9-NEXT: [[UMULH23:%[0-9]+]]:_(s32) = G_UMULH [[UADDO44]], [[ADD25]]
     ; GFX9-NEXT: [[UADDO50:%[0-9]+]]:_(s32), [[UADDO51:%[0-9]+]]:_(s1) = G_UADDO [[MUL29]], [[UMULH22]]
     ; GFX9-NEXT: [[ZEXT22:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO51]](s1)
     ; GFX9-NEXT: [[UADDO52:%[0-9]+]]:_(s32), [[UADDO53:%[0-9]+]]:_(s1) = G_UADDO [[UADDO50]], [[UMULH23]]
     ; GFX9-NEXT: [[ZEXT23:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO53]](s1)
-    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
-    ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD28]]
+    ; GFX9-NEXT: [[ADD27:%[0-9]+]]:_(s32) = G_ADD [[ZEXT22]], [[ZEXT23]]
+    ; GFX9-NEXT: [[UADDO54:%[0-9]+]]:_(s32), [[UADDO55:%[0-9]+]]:_(s1) = G_UADDO [[UADDO52]], [[ADD26]]
     ; GFX9-NEXT: [[ZEXT24:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO55]](s1)
-    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ADD29]], [[ZEXT24]]
-    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE6]], [[ADD27]]
-    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD30]]
+    ; GFX9-NEXT: [[ADD28:%[0-9]+]]:_(s32) = G_ADD [[ADD27]], [[ZEXT24]]
+    ; GFX9-NEXT: [[UMULH24:%[0-9]+]]:_(s32) = G_UMULH [[UADDE4]], [[ADD25]]
+    ; GFX9-NEXT: [[ADD29:%[0-9]+]]:_(s32) = G_ADD [[UMULH24]], [[ADD28]]
     ; GFX9-NEXT: [[UADDO56:%[0-9]+]]:_(s32), [[UADDO57:%[0-9]+]]:_(s1) = G_UADDO [[UADDO44]], [[UADDO54]]
-    ; GFX9-NEXT: [[UADDE8:%[0-9]+]]:_(s32), [[UADDE9:%[0-9]+]]:_(s1) = G_UADDE [[ADD25]], [[ADD31]], [[UADDO45]]
-    ; GFX9-NEXT: [[UADDE10:%[0-9]+]]:_(s32), [[UADDE11:%[0-9]+]]:_(s1) = G_UADDE [[UADDE8]], [[C5]], [[UADDO57]]
+    ; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UADDE4]], [[ADD29]], [[UADDO57]]
     ; GFX9-NEXT: [[UV24:%[0-9]+]]:_(s32), [[UV25:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[UV26:%[0-9]+]]:_(s32), [[UV27:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
     ; GFX9-NEXT: [[MUL30:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDO56]]
-    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE10]]
+    ; GFX9-NEXT: [[MUL31:%[0-9]+]]:_(s32) = G_MUL [[UV26]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH25:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDO56]]
     ; GFX9-NEXT: [[UADDO58:%[0-9]+]]:_(s32), [[UADDO59:%[0-9]+]]:_(s1) = G_UADDO [[MUL30]], [[MUL31]]
     ; GFX9-NEXT: [[ZEXT25:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO59]](s1)
     ; GFX9-NEXT: [[UADDO60:%[0-9]+]]:_(s32), [[UADDO61:%[0-9]+]]:_(s1) = G_UADDO [[UADDO58]], [[UMULH25]]
     ; GFX9-NEXT: [[ZEXT26:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO61]](s1)
-    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
-    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE10]]
+    ; GFX9-NEXT: [[ADD30:%[0-9]+]]:_(s32) = G_ADD [[ZEXT25]], [[ZEXT26]]
+    ; GFX9-NEXT: [[MUL32:%[0-9]+]]:_(s32) = G_MUL [[UV27]], [[UADDE6]]
     ; GFX9-NEXT: [[UMULH26:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDO56]]
-    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE10]]
+    ; GFX9-NEXT: [[UMULH27:%[0-9]+]]:_(s32) = G_UMULH [[UV26]], [[UADDE6]]
     ; GFX9-NEXT: [[UADDO62:%[0-9]+]]:_(s32), [[UADDO63:%[0-9]+]]:_(s1) = G_UADDO [[MUL32]], [[UMULH26]]
     ; GFX9-NEXT: [[ZEXT27:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO63]](s1)
     ; GFX9-NEXT: [[UADDO64:%[0-9]+]]:_(s32), [[UADDO65:%[0-9]+]]:_(s1) = G_UADDO [[UADDO62]], [[UMULH27]]
     ; GFX9-NEXT: [[ZEXT28:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO65]](s1)
-    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
-    ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD32]]
+    ; GFX9-NEXT: [[ADD31:%[0-9]+]]:_(s32) = G_ADD [[ZEXT27]], [[ZEXT28]]
+    ; GFX9-NEXT: [[UADDO66:%[0-9]+]]:_(s32), [[UADDO67:%[0-9]+]]:_(s1) = G_UADDO [[UADDO64]], [[ADD30]]
     ; GFX9-NEXT: [[ZEXT29:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO67]](s1)
-    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[ADD33]], [[ZEXT29]]
-    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE10]]
-    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD34]]
+    ; GFX9-NEXT: [[ADD32:%[0-9]+]]:_(s32) = G_ADD [[ADD31]], [[ZEXT29]]
+    ; GFX9-NEXT: [[UMULH28:%[0-9]+]]:_(s32) = G_UMULH [[UV27]], [[UADDE6]]
+    ; GFX9-NEXT: [[ADD33:%[0-9]+]]:_(s32) = G_ADD [[UMULH28]], [[ADD32]]
     ; GFX9-NEXT: [[UV28:%[0-9]+]]:_(s32), [[UV29:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[MUL33:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[UADDO66]]
     ; GFX9-NEXT: [[MUL34:%[0-9]+]]:_(s32) = G_MUL [[UV29]], [[UADDO66]]
-    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD35]]
+    ; GFX9-NEXT: [[MUL35:%[0-9]+]]:_(s32) = G_MUL [[UV28]], [[ADD33]]
     ; GFX9-NEXT: [[UMULH29:%[0-9]+]]:_(s32) = G_UMULH [[UV28]], [[UADDO66]]
-    ; GFX9-NEXT: [[ADD36:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
-    ; GFX9-NEXT: [[ADD37:%[0-9]+]]:_(s32) = G_ADD [[ADD36]], [[UMULH29]]
+    ; GFX9-NEXT: [[ADD34:%[0-9]+]]:_(s32) = G_ADD [[MUL34]], [[MUL35]]
+    ; GFX9-NEXT: [[ADD35:%[0-9]+]]:_(s32) = G_ADD [[ADD34]], [[UMULH29]]
     ; GFX9-NEXT: [[USUBO10:%[0-9]+]]:_(s32), [[USUBO11:%[0-9]+]]:_(s1) = G_USUBO [[UV24]], [[MUL33]]
-    ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD37]], [[USUBO11]]
-    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD37]]
+    ; GFX9-NEXT: [[USUBE14:%[0-9]+]]:_(s32), [[USUBE15:%[0-9]+]]:_(s1) = G_USUBE [[UV25]], [[ADD35]], [[USUBO11]]
+    ; GFX9-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[UV25]], [[ADD35]]
     ; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO10]](s32), [[USUBE14]](s32)
     ; GFX9-NEXT: [[UV30:%[0-9]+]]:_(s32), [[UV31:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
     ; GFX9-NEXT: [[ICMP8:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE14]](s32), [[UV31]]
@@ -2079,71 +2061,69 @@ body: |
     ; GFX6-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX6-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX6-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX6-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX6-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX6-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX6-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX6-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX6-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX6-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX6-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX6-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX6-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX6-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX6-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX6-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX6-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX6-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX6-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX6-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX6-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX6-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX6-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX6-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX6-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX6-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX6-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX6-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX6-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX6-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX6-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX6-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX6-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX6-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX6-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX6-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX6-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX6-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX6-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX6-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX6-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX6-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX6-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX6-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX6-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX6-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX6-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX6-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX6-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX6-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX6-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX6-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX6-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX6-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX6-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX6-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX6-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX6-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
@@ -2228,71 +2208,69 @@ body: |
     ; GFX8-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX8-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX8-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX8-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX8-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX8-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX8-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX8-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX8-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX8-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX8-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX8-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX8-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX8-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX8-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX8-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX8-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX8-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX8-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX8-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX8-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX8-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX8-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX8-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX8-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX8-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX8-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX8-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX8-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX8-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX8-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX8-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX8-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX8-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX8-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX8-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX8-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX8-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX8-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX8-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX8-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX8-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX8-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX8-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX8-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX8-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX8-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX8-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX8-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX8-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX8-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX8-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX8-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX8-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX8-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX8-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX8-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX8-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]
@@ -2377,71 +2355,69 @@ body: |
     ; GFX9-NEXT: [[ADD5:%[0-9]+]]:_(s32) = G_ADD [[UMULH4]], [[ADD4]]
     ; GFX9-NEXT: [[UADDO10:%[0-9]+]]:_(s32), [[UADDO11:%[0-9]+]]:_(s1) = G_UADDO [[FPTOUI]], [[UADDO8]]
     ; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[FPTOUI1]], [[ADD5]], [[UADDO11]]
-    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[FPTOUI1]], [[ADD5]]
     ; GFX9-NEXT: [[MUL6:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL7:%[0-9]+]]:_(s32) = G_MUL [[USUBE]], [[UADDO10]]
     ; GFX9-NEXT: [[MUL8:%[0-9]+]]:_(s32) = G_MUL [[USUBO]], [[UADDE]]
     ; GFX9-NEXT: [[UMULH5:%[0-9]+]]:_(s32) = G_UMULH [[USUBO]], [[UADDO10]]
-    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
-    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ADD7]], [[UMULH5]]
+    ; GFX9-NEXT: [[ADD6:%[0-9]+]]:_(s32) = G_ADD [[MUL7]], [[MUL8]]
+    ; GFX9-NEXT: [[ADD7:%[0-9]+]]:_(s32) = G_ADD [[ADD6]], [[UMULH5]]
     ; GFX9-NEXT: [[MUL9:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[MUL10:%[0-9]+]]:_(s32) = G_MUL [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH6:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[MUL6]]
     ; GFX9-NEXT: [[UADDO12:%[0-9]+]]:_(s32), [[UADDO13:%[0-9]+]]:_(s1) = G_UADDO [[MUL9]], [[MUL10]]
     ; GFX9-NEXT: [[ZEXT5:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO13]](s1)
     ; GFX9-NEXT: [[UADDO14:%[0-9]+]]:_(s32), [[UADDO15:%[0-9]+]]:_(s1) = G_UADDO [[UADDO12]], [[UMULH6]]
     ; GFX9-NEXT: [[ZEXT6:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO15]](s1)
-    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
-    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD8]]
+    ; GFX9-NEXT: [[ADD8:%[0-9]+]]:_(s32) = G_ADD [[ZEXT5]], [[ZEXT6]]
+    ; GFX9-NEXT: [[MUL11:%[0-9]+]]:_(s32) = G_MUL [[UADDE]], [[ADD7]]
     ; GFX9-NEXT: [[UMULH7:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[MUL6]]
-    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD8]]
+    ; GFX9-NEXT: [[UMULH8:%[0-9]+]]:_(s32) = G_UMULH [[UADDO10]], [[ADD7]]
     ; GFX9-NEXT: [[UADDO16:%[0-9]+]]:_(s32), [[UADDO17:%[0-9]+]]:_(s1) = G_UADDO [[MUL11]], [[UMULH7]]
     ; GFX9-NEXT: [[ZEXT7:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO17]](s1)
     ; GFX9-NEXT: [[UADDO18:%[0-9]+]]:_(s32), [[UADDO19:%[0-9]+]]:_(s1) = G_UADDO [[UADDO16]], [[UMULH8]]
     ; GFX9-NEXT: [[ZEXT8:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO19]](s1)
-    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
-    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD9]]
+    ; GFX9-NEXT: [[ADD9:%[0-9]+]]:_(s32) = G_ADD [[ZEXT7]], [[ZEXT8]]
+    ; GFX9-NEXT: [[UADDO20:%[0-9]+]]:_(s32), [[UADDO21:%[0-9]+]]:_(s1) = G_UADDO [[UADDO18]], [[ADD8]]
     ; GFX9-NEXT: [[ZEXT9:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO21]](s1)
-    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[ADD10]], [[ZEXT9]]
-    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD8]]
-    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD11]]
+    ; GFX9-NEXT: [[ADD10:%[0-9]+]]:_(s32) = G_ADD [[ADD9]], [[ZEXT9]]
+    ; GFX9-NEXT: [[UMULH9:%[0-9]+]]:_(s32) = G_UMULH [[UADDE]], [[ADD7]]
+    ; GFX9-NEXT: [[ADD11:%[0-9]+]]:_(s32) = G_ADD [[UMULH9]], [[ADD10]]
     ; GFX9-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
     ; GFX9-NEXT: [[UADDO22:%[0-9]+]]:_(s32), [[UADDO23:%[0-9]+]]:_(s1) = G_UADDO [[UADDO10]], [[UADDO20]]
-    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[ADD6]], [[ADD12]], [[UADDO11]]
-    ; GFX9-NEXT: [[UADDE4:%[0-9]+]]:_(s32), [[UADDE5:%[0-9]+]]:_(s1) = G_UADDE [[UADDE2]], [[C6]], [[UADDO23]]
+    ; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UADDE]], [[ADD11]], [[UADDO23]]
     ; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND]](s64)
     ; GFX9-NEXT: [[MUL12:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[MUL13:%[0-9]+]]:_(s32) = G_MUL [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH10:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDO22]]
     ; GFX9-NEXT: [[UADDO24:%[0-9]+]]:_(s32), [[UADDO25:%[0-9]+]]:_(s1) = G_UADDO [[MUL12]], [[MUL13]]
     ; GFX9-NEXT: [[ZEXT10:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO25]](s1)
     ; GFX9-NEXT: [[UADDO26:%[0-9]+]]:_(s32), [[UADDO27:%[0-9]+]]:_(s1) = G_UADDO [[UADDO24]], [[UMULH10]]
     ; GFX9-NEXT: [[ZEXT11:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO27]](s1)
-    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
-    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE4]]
+    ; GFX9-NEXT: [[ADD12:%[0-9]+]]:_(s32) = G_ADD [[ZEXT10]], [[ZEXT11]]
+    ; GFX9-NEXT: [[MUL14:%[0-9]+]]:_(s32) = G_MUL [[UV9]], [[UADDE2]]
     ; GFX9-NEXT: [[UMULH11:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDO22]]
-    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE4]]
+    ; GFX9-NEXT: [[UMULH12:%[0-9]+]]:_(s32) = G_UMULH [[UV8]], [[UADDE2]]
     ; GFX9-NEXT: [[UADDO28:%[0-9]+]]:_(s32), [[UADDO29:%[0-9]+]]:_(s1) = G_UADDO [[MUL14]], [[UMULH11]]
     ; GFX9-NEXT: [[ZEXT12:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO29]](s1)
     ; GFX9-NEXT: [[UADDO30:%[0-9]+]]:_(s32), [[UADDO31:%[0-9]+]]:_(s1) = G_UADDO [[UADDO28]], [[UMULH12]]
     ; GFX9-NEXT: [[ZEXT13:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO31]](s1)
-    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
-    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD13]]
+    ; GFX9-NEXT: [[ADD13:%[0-9]+]]:_(s32) = G_ADD [[ZEXT12]], [[ZEXT13]]
+    ; GFX9-NEXT: [[UADDO32:%[0-9]+]]:_(s32), [[UADDO33:%[0-9]+]]:_(s1) = G_UADDO [[UADDO30]], [[ADD12]]
     ; GFX9-NEXT: [[ZEXT14:%[0-9]+]]:_(s32) = G_ZEXT [[UADDO33]](s1)
-    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[ADD14]], [[ZEXT14]]
-    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE4]]
-    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD15]]
+    ; GFX9-NEXT: [[ADD14:%[0-9]+]]:_(s32) = G_ADD [[ADD13]], [[ZEXT14]]
+    ; GFX9-NEXT: [[UMULH13:%[0-9]+]]:_(s32) = G_UMULH [[UV9]], [[UADDE2]]
+    ; GFX9-NEXT: [[ADD15:%[0-9]+]]:_(s32) = G_ADD [[UMULH13]], [[ADD14]]
     ; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX9-NEXT: [[MUL15:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[UADDO32]]
     ; GFX9-NEXT: [[MUL16:%[0-9]+]]:_(s32) = G_MUL [[UV11]], [[UADDO32]]
-    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD16]]
+    ; GFX9-NEXT: [[MUL17:%[0-9]+]]:_(s32) = G_MUL [[UV10]], [[ADD15]]
     ; GFX9-NEXT: [[UMULH14:%[0-9]+]]:_(s32) = G_UMULH [[UV10]], [[UADDO32]]
-    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
-    ; GFX9-NEXT: [[ADD18:%[0-9]+]]:_(s32) = G_ADD [[ADD17]], [[UMULH14]]
+    ; GFX9-NEXT: [[ADD16:%[0-9]+]]:_(s32) = G_ADD [[MUL16]], [[MUL17]]
+    ; GFX9-NEXT: [[ADD17:%[0-9]+]]:_(s32) = G_ADD [[ADD16]], [[UMULH14]]
     ; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV6]], [[MUL15]]
-    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD18]], [[USUBO3]]
-    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD18]]
+    ; GFX9-NEXT: [[USUBE2:%[0-9]+]]:_(s32), [[USUBE3:%[0-9]+]]:_(s1) = G_USUBE [[UV7]], [[ADD17]], [[USUBO3]]
+    ; GFX9-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[UV7]], [[ADD17]]
     ; GFX9-NEXT: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[USUBO2]](s32), [[USUBE2]](s32)
     ; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[AND1]](s64)
     ; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[USUBE2]](s32), [[UV13]]

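For readers scanning the hunks above: in each case the removed lines compute the high half of a 64-bit sum twice, once as a plain add with no carry-in (the dropped G_ADD / v_add_i32_e64) and once more by folding the low-half carry back in through a second G_UADDE against a zero constant, while the added lines use a single carry chain ending in one G_UADDE / v_addc_u32_e32. A minimal standalone C++ sketch of the two schemes, with hypothetical uaddo/uadde helpers standing in for G_UADDO (add with carry-out) and G_UADDE (add with carry-in and carry-out); this is illustration only, not the in-tree expansion:

#include <cassert>
#include <cstdint>
#include <cstdio>

// Hypothetical model of G_UADDO: 32-bit add producing a carry-out.
static uint32_t uaddo(uint32_t a, uint32_t b, uint32_t &carry_out) {
  uint32_t s = a + b;
  carry_out = s < a; // carry-out on unsigned wraparound
  return s;
}

// Hypothetical model of G_UADDE: 32-bit add with carry-in and carry-out.
static uint32_t uadde(uint32_t a, uint32_t b, uint32_t carry_in,
                      uint32_t &carry_out) {
  uint64_t s = uint64_t(a) + b + carry_in;
  carry_out = uint32_t(s >> 32);
  return uint32_t(s);
}

int main() {
  uint64_t x = 0x00000001ffffffffULL, y = 0x0000000100000001ULL;
  uint32_t xl = uint32_t(x), xh = uint32_t(x >> 32);
  uint32_t yl = uint32_t(y), yh = uint32_t(y >> 32);
  uint32_t c0, c1;

  // New scheme: one carry chain, one add per half.
  uint32_t lo = uaddo(xl, yl, c0);
  uint32_t hi = uadde(xh, yh, c0, c1);

  // Old scheme: add the high halves *without* carry-in, then add the
  // low-half carry back in with an extra carry-propagating add of zero.
  uint32_t hi_nocarry = xh + yh;
  uint32_t hi_old = uadde(hi_nocarry, 0, c0, c1);

  assert(hi == hi_old);
  assert(((uint64_t(hi) << 32) | lo) == x + y);
  printf("sum = 0x%016llx\n", (unsigned long long)((uint64_t(hi) << 32) | lo));
}

The zero add in the old scheme is a wasted instruction per 64-bit sum, which is why each hunk above nets out a few lines shorter.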
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index ecbc31ee2d8d..c1c967d534d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -68,38 +68,36 @@ define i64 @v_sdiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
-; CHECK-NEXT:    v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v3
-; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v8, v6
+; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v3
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
-; CHECK-NEXT:    v_mul_lo_u32 v12, v3, v8
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v13
-; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v8
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v11
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v12, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v8
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v4, v3
@@ -225,24 +223,53 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; CHECK-NEXT:    s_mov_b32 s7, s6
 ; CHECK-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
-; CHECK-NEXT:    s_sub_u32 s3, 0, s10
+; CHECK-NEXT:    s_sub_u32 s0, 0, s10
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s11
+; CHECK-NEXT:    s_subb_u32 s1, 0, s11
+; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, s11
-; CHECK-NEXT:    v_mul_lo_u32 v3, s3, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v5, s3, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
@@ -267,38 +294,7 @@ define amdgpu_ps i64 @s_sdiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s5, v0
-; CHECK-NEXT:    v_mul_lo_u32 v5, s3, v3
-; CHECK-NEXT:    v_mul_hi_u32 v8, s3, v0
-; CHECK-NEXT:    v_mul_lo_u32 v7, s3, v0
-; CHECK-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v2
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v4, v8
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v4
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v8, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v4
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[0:1], v7, v8
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v7, v5
-; CHECK-NEXT:    v_add_i32_e64 v3, s[0:1], v3, v4
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v2, s13, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s12, v1
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s12, v0
@@ -453,38 +449,36 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_addc_u32_e64 v15, s[4:5], v10, v14, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v13, v9
-; GISEL-NEXT:    v_mul_lo_u32 v16, v12, v15
-; GISEL-NEXT:    v_mul_lo_u32 v17, v12, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
+; GISEL-NEXT:    v_mul_lo_u32 v15, v12, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v12, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; GISEL-NEXT:    v_mul_lo_u32 v13, v15, v17
-; GISEL-NEXT:    v_mul_lo_u32 v16, v9, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v17
-; GISEL-NEXT:    v_mul_hi_u32 v17, v15, v17
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v14, v15, v12
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v12, v15, v12
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v15
+; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v15
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v15
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v16, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v1, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v9
@@ -591,40 +585,38 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v9, v13, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v8
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v10
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v9, v14
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v15, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
@@ -754,38 +746,36 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v11, v14, vcc
+; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v14, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v13, v3
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v12, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v3
 ; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
-; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
-; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v11, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v3, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v11, v15
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v12
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v11, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v10, v3
 ; CGP-NEXT:    v_mul_lo_u32 v13, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v3
@@ -933,38 +923,36 @@ define <2 x i64> @v_sdiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v9, v12, vcc
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v11, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v5
 ; CGP-NEXT:    v_mul_hi_u32 v10, v10, v5
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v12, v5, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v10
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v10, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v5
 ; CGP-NEXT:    v_mul_lo_u32 v11, v6, v9
 ; CGP-NEXT:    v_mul_hi_u32 v12, v6, v5
@@ -1063,32 +1051,62 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
+; CHECK-NEXT:    s_movk_i32 s4, 0xf000
+; CHECK-NEXT:    s_movk_i32 s6, 0x1000
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1096,7 +1114,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1107,44 +1125,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CHECK-NEXT:    s_movk_i32 s6, 0x1000
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1152,20 +1138,20 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
@@ -1178,7 +1164,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1193,12 +1179,12 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 4096
   ret i64 %result
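
[Editorial sketch, not part of the patch:] In the updated checks above, each multiply-accumulate sequence now ends with a plain v_add_i32 / v_addc_u32 pair, where the old checks used s[4:5]-carried adds plus a trailing `v_addc_u32 ..., 0, ...`. As a rough model of what that final instruction pair computes, here is a minimal C sketch of a 64-bit add built from 32-bit halves, with the low-half carry-out feeding the high-half add; the names are illustrative only:

  #include <stdint.h>

  /* Sketch of the v_add_i32/v_addc_u32 pattern in the new checks:
     the low-half add produces a carry (vcc) that the high-half add
     consumes directly. */
  static uint64_t add64_via_halves(uint32_t a_lo, uint32_t a_hi,
                                   uint32_t b_lo, uint32_t b_hi) {
    uint32_t lo = a_lo + b_lo;         /* v_add_i32: sets carry in vcc */
    uint32_t carry = lo < a_lo;        /* carry-out of the low half   */
    uint32_t hi = a_hi + b_hi + carry; /* v_addc_u32: adds carry-in   */
    return ((uint64_t)hi << 32) | lo;
  }
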
@@ -1219,13 +1205,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    s_sub_u32 s11, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s8
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s12, 0, s9
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -1234,10 +1220,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
@@ -1264,39 +1250,37 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
@@ -1361,13 +1345,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GISEL-NEXT:    s_sub_u32 s8, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s9, 0, s7
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -1377,12 +1361,12 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1410,39 +1394,37 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
@@ -1511,32 +1493,31 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_movk_i32 s6, 0xf000
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v4, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v5
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_movk_i32 s7, 0x1000
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -1555,50 +1536,49 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, s6, v7
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v1, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_mul_hi_u32 v10, v0, v8
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
@@ -1681,40 +1661,38 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
@@ -1786,32 +1764,62 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_sdiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0xffed2705
+; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1819,7 +1827,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1830,44 +1838,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1875,20 +1851,20 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
@@ -1901,7 +1877,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v7, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, 1, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[4:5]
 ; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, 0, v4, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1916,12 +1892,12 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 1235195
   ret i64 %result
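
[Editorial sketch, not part of the patch:] For reference when reading these checks, the float constants that recur in them decode as: 0x4f800000 = 2^32, 0x5f7ffffc = just under 2^64, 0x2f800000 = 2^-32, and 0xcf800000 = -2^32. My reading of the check lines is that they build a ~2^64-scaled reciprocal seed of the denominator in 32-bit float steps, then split it into two 32-bit halves for the integer refinement that follows. A hypothetical C model of that seed computation (an assumption drawn from the assembly, not the compiler's code; all names are illustrative):

  #include <stdint.h>
  #include <math.h>

  /* Each step mirrors one 32-bit float instruction in the checks. */
  static void rcp64_seed(uint32_t d_lo, uint32_t d_hi,
                         uint32_t *rcp_lo, uint32_t *rcp_hi) {
    /* v_cvt_f32_u32 / v_cvt_f32_ubyte0 + v_mac_f32 0x4f800000 (2^32f) */
    float d = (float)d_hi * 0x1.0p32f + (float)d_lo;
    /* v_rcp_iflag_f32, then v_mul_f32 0x5f7ffffc (just under 2^64) */
    float r = (1.0f / d) * 0x1.fffff8p63f;
    /* v_mul_f32 0x2f800000 (2^-32f) + v_trunc_f32: high 32 bits */
    float hi = truncf(r * 0x1.0p-32f);
    /* v_mac_f32 0xcf800000 (-2^32f): remove the high part again */
    float lo = r - hi * 0x1.0p32f;
    *rcp_hi = (uint32_t)hi;  /* v_cvt_u32_f32 */
    *rcp_lo = (uint32_t)lo;  /* v_cvt_u32_f32 */
  }

This only yields a low-precision estimate; the long v_mul_lo/v_mul_hi/v_addc sequences in the checks then refine it in integer arithmetic before the quotient is selected.
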
@@ -1942,13 +1918,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    s_sub_u32 s11, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s8
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s12, 0, s9
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -1957,10 +1933,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
@@ -1987,39 +1963,37 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
@@ -2084,13 +2058,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GISEL-NEXT:    s_sub_u32 s8, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s9, 0, s7
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -2100,12 +2074,12 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -2133,39 +2107,37 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
@@ -2234,32 +2206,62 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
+; CGP-NEXT:    v_mov_b32_e32 v4, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v5
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -2278,39 +2280,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, s6, v7
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
@@ -2404,40 +2374,38 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
@@ -2571,38 +2539,36 @@ define i64 @v_sdiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CHECK-NEXT:    v_addc_u32_e64 v11, s[4:5], v6, v10, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v9, v5
-; CHECK-NEXT:    v_mul_lo_u32 v12, v8, v11
-; CHECK-NEXT:    v_mul_lo_u32 v13, v8, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v8, v6
+; CHECK-NEXT:    v_mul_lo_u32 v11, v8, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v8, v5
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_mul_lo_u32 v9, v11, v13
-; CHECK-NEXT:    v_mul_lo_u32 v12, v5, v8
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v13
-; CHECK-NEXT:    v_mul_hi_u32 v13, v11, v13
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v10, v11, v8
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v8
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v8, v11, v8
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v8
+; CHECK-NEXT:    v_mul_hi_u32 v12, v5, v11
+; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v12, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v8
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v5
@@ -2702,8 +2668,8 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[6:7], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[7:8], s[6:7], v4
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
+; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v10, 31, v1
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v4
@@ -2716,123 +2682,121 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v8, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v10
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v5
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v0
-; GISEL-NEXT:    v_xor_b32_e32 v17, v1, v10
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
+; GISEL-NEXT:    v_mac_f32_e32 v8, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GISEL-NEXT:    v_xor_b32_e32 v17, v0, v10
+; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v8
+; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v9
+; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v8
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v15
-; GISEL-NEXT:    v_mul_lo_u32 v16, v0, v13
-; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v15
+; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v15
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v13
+; GISEL-NEXT:    v_mul_hi_u32 v0, v8, v15
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v15
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v14, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v16, v1
-; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v13
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v14, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_mul_hi_u32 v16, v8, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v14, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], v8, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v1
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v0
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v1, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v1, v15
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v11
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v13
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v12
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v17, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT:    v_lshl_b64 v[0:1], s[6:7], v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v17, v8
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_xor_b32_e32 v14, v1, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v8, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v1
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v17, v8
+; GISEL-NEXT:    v_lshl_b64 v[0:1], s[4:5], v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v17, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v17, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; GISEL-NEXT:    v_mul_hi_u32 v12, v17, v8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v11
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v14, v5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v13, v5, v6
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v17, v11, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], v17, v11
+; GISEL-NEXT:    v_mul_hi_u32 v8, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v6
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v13, v5, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v6
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v17, v12
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v14, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v9, s[4:5], v14, v9
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v7
-; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, v11, v7, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v9, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
-; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v9, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v11, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v7
-; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v11, vcc
+; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, v13, v14, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v6
 ; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v9, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v7
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v9, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v14, vcc
@@ -2858,133 +2822,131 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v10, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v10
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v1, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v3, v3
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, 0, v8
 ; GISEL-NEXT:    v_subb_u32_e32 v12, vcc, 0, v9, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v1
+; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v11, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v0
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
+; GISEL-NEXT:    v_xor_b32_e32 v17, v1, v10
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v15
+; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v15
 ; GISEL-NEXT:    v_mul_lo_u32 v16, v0, v13
-; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v1, v15
-; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v15
+; GISEL-NEXT:    v_mul_hi_u32 v15, v3, v15
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v10
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v17, v1, v13
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v14, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v13
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v16, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_mul_hi_u32 v13, v3, v13
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v14, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v1, v13, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v3, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v0
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v13
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
-; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v11
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v11, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v12
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v2, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v12
+; GISEL-NEXT:    v_xor_b32_e32 v4, v10, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v13
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v3
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v3
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v11
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v17, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v5, v17, v11
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v6, v7, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v2, v12
+; GISEL-NEXT:    v_mul_lo_u32 v6, v2, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v11
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v7, v3, v12
+; GISEL-NEXT:    v_mul_hi_u32 v7, v17, v3
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v12
+; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
-; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v2, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_mul_lo_u32 v6, v9, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v3
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v17, v11
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v2, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v6
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v8
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v11, v9
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s[4:5]
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v13, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v13, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v12, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v8, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v11, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v8, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -3059,38 +3021,36 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
-; CGP-NEXT:    v_addc_u32_e64 v15, s[4:5], v9, v14, vcc
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v13, v3
-; CGP-NEXT:    v_mul_lo_u32 v16, v12, v15
-; CGP-NEXT:    v_mul_lo_u32 v17, v12, v3
+; CGP-NEXT:    v_mul_lo_u32 v14, v12, v9
+; CGP-NEXT:    v_mul_lo_u32 v15, v12, v3
 ; CGP-NEXT:    v_mul_hi_u32 v12, v12, v3
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v14
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v16, v3, v12
-; CGP-NEXT:    v_mul_hi_u32 v14, v3, v17
-; CGP-NEXT:    v_mul_hi_u32 v17, v15, v17
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v3, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v17, v16
-; CGP-NEXT:    v_mul_hi_u32 v12, v15, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v15
+; CGP-NEXT:    v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT:    v_mul_hi_u32 v16, v3, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v9, v15
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v9, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v3, v12
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v13
-; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v8, v3
 ; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v3
@@ -3238,38 +3198,36 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v11, v6
-; CGP-NEXT:    v_mul_lo_u32 v14, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v10, v6
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v8
+; CGP-NEXT:    v_mul_lo_u32 v13, v10, v6
 ; CGP-NEXT:    v_mul_hi_u32 v10, v10, v6
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v15
-; CGP-NEXT:    v_mul_lo_u32 v14, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v12, v6, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v13, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v14, v6, v10
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v13, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v6, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v14, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v12, v6, v10
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v7, v6
 ; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, v5, v6
@@ -3429,90 +3387,88 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
-; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v2
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v2
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v0, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v12
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v10
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v4, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v2
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v2
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v2
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v10
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
+; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v2
-; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
@@ -3520,7 +3476,7 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -3532,8 +3488,8 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v13, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v13, v7
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v11, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v11, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v3
 ; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v7, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
@@ -3551,147 +3507,145 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v9
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 0, v6
-; GISEL-NEXT:    v_addc_u32_e64 v7, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v6
+; GISEL-NEXT:    v_addc_u32_e64 v5, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v9
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v7
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v5, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v9, v11, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v10, v12, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v4, v3, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
-; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
-; GISEL-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, 0, v6
-; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v3
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v3
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v3
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v3, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 0, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v3, v11
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v6
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v2
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], 0, v3
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, v5, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v2
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v2
+; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v2
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v2, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v2, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v2, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v2
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v2, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], v4, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v0
-; GISEL-NEXT:    v_mul_hi_u32 v5, v5, v0
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v11
-; GISEL-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v3, v11
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v5
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v3, v3, v5
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v9
-; GISEL-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v5
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v0, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v14, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v3
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v0
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v6
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v2, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v14, v3
-; GISEL-NEXT:    v_mul_hi_u32 v4, v14, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v3
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v6
+; GISEL-NEXT:    v_mul_hi_u32 v2, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v14, v3
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v6, v12, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v4, v7, v2
-; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v3
-; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v2
-; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v13, v8
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v14, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], v14, v4
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_subb_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v6, v5, v2
+; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v3, v2
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v2
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v11, v8
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v12, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v12, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT:    v_subb_u32_e32 v6, vcc, v6, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
-; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v7, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v10, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v2
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v7
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v11, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v10, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v10, v6, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 0, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index ee7f2a8caa74..9f6ae0824dbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -168,9 +168,9 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s14, 0, s8
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_sub_u32 s0, 0, s8
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -178,17 +178,46 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s15, 0, s9
-; GFX8-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, s15, v0
-; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, s14, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s9
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
@@ -209,38 +238,7 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s14, v3
-; GFX8-NEXT:    v_mul_hi_u32 v8, s14, v0
-; GFX8-NEXT:    v_mul_lo_u32 v7, s14, v0
-; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s10, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s10, v0
@@ -297,15 +295,16 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
@@ -348,9 +347,9 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s14, 0, s8
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -358,12 +357,12 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s15, 0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v2, s14, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s15, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v0
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
@@ -387,36 +386,34 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s15, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s14, v3
-; GFX9-NEXT:    v_mul_hi_u32 v6, s14, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, s14, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s10, v0
@@ -472,15 +469,16 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v9, v2, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
@@ -512,16 +510,16 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    s_cmp_lg_u32 s3, 0
 ; GFX10-NEXT:    s_mov_b32 s3, s2
 ; GFX10-NEXT:    s_addc_u32 s9, s11, s12
-; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[2:3]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX10-NEXT:    s_sub_u32 s1, 0, s8
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_sub_u32 s10, 0, s8
+; GFX10-NEXT:    s_cselect_b32 s11, 1, 0
+; GFX10-NEXT:    s_and_b32 s11, s11, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s14, 0, s9
+; GFX10-NEXT:    s_cmp_lg_u32 s11, 0
+; GFX10-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -530,11 +528,11 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, s10, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v4, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s11, v0
+; GFX10-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v1, v5
@@ -543,111 +541,110 @@ define amdgpu_kernel void @sdivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v3, s14, v3, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s14
+; GFX10-NEXT:    v_add_co_u32 v6, s14, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s14
+; GFX10-NEXT:    v_add_co_u32 v3, s14, v3, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s14
+; GFX10-NEXT:    v_add_co_u32 v5, s14, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v3, s0, v5, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v3, s14, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s14
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v7
-; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v5
-; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX10-NEXT:    v_mul_hi_u32 v7, v0, v7
-; GFX10-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, v3, v4
-; GFX10-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v8, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v6, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v9, v8
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add3_u32 v2, v4, v6, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s11, v0
-; GFX10-NEXT:    v_mul_hi_u32 v4, s11, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s10, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v1
-; GFX10-NEXT:    v_mul_hi_u32 v6, s10, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s11, v1
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v3
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v5, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v2, s0, v4, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v4, s10, v1
+; GFX10-NEXT:    v_mul_hi_u32 v6, v1, v5
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v2
+; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX10-NEXT:    v_add_co_u32 v3, s10, v3, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v6, s10, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v3, s10, v3, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v5, s10, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s10
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v6
+; GFX10-NEXT:    v_add_co_u32 v3, s10, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
+; GFX10-NEXT:    v_mul_hi_u32 v4, s1, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v1
+; GFX10-NEXT:    v_mul_hi_u32 v6, s0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v1, s1, v1
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v4, s10, v5, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
+; GFX10-NEXT:    v_add_co_u32 v2, s10, v4, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v3, v0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
+; GFX10-NEXT:    v_add_co_u32 v0, s10, v2, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v0
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 1
 ; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s8, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s11, v2
-; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s10, v5
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s0, s11, v2, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v6, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s1, v2
+; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s0, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v2, s0, s1, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v5, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v5, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s9, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v13, s0, v3, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, v12, v9, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v10, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v9
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v7, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v13, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s8, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s9, v1
@@ -1365,9 +1362,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_sub_u32 s16, 0, s8
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
+; GFX8-NEXT:    s_sub_u32 s0, 0, s8
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1375,17 +1372,46 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s17, 0, s9
-; GFX8-NEXT:    v_mul_lo_u32 v2, s16, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, s17, v0
-; GFX8-NEXT:    v_mul_hi_u32 v5, s16, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, s16, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s9
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
@@ -1406,38 +1432,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s17, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s16, v3
-; GFX8-NEXT:    v_mul_hi_u32 v8, s16, v0
-; GFX8-NEXT:    v_mul_lo_u32 v7, s16, v0
-; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s15, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s14, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s14, v0
@@ -1494,17 +1489,18 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX8-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX8-NEXT:    s_ashr_i32 s12, s3, 31
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    s_add_u32 s0, s10, s8
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -1523,32 +1519,61 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s3
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v5, s2
-; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
-; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    s_mov_b32 s9, s8
+; GFX8-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
 ; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
 ; GFX8-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v4
-; GFX8-NEXT:    v_mov_b32_e32 v6, s6
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GFX8-NEXT:    s_sub_u32 s0, 0, s2
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
+; GFX8-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX8-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
+; GFX8-NEXT:    v_add_f32_e32 v4, v7, v4
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s3
+; GFX8-NEXT:    v_mul_lo_u32 v4, s1, v7
+; GFX8-NEXT:    v_mul_lo_u32 v8, s0, v6
+; GFX8-NEXT:    v_mul_hi_u32 v10, s0, v7
+; GFX8-NEXT:    v_mul_lo_u32 v9, s0, v7
+; GFX8-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v8
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v4, v10
+; GFX8-NEXT:    v_xor_b32_e32 v2, s6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, s6
+; GFX8-NEXT:    v_mul_lo_u32 v10, v6, v9
+; GFX8-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s6, v3
-; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v2, v6, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v7
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX8-NEXT:    s_mov_b32 s9, s8
-; GFX8-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX8-NEXT:    s_xor_b64 s[6:7], s[0:1], s[8:9]
-; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
-; GFX8-NEXT:    s_sub_u32 s10, 0, s2
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s11, 0, s3
-; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, s10, v2
+; GFX8-NEXT:    v_subb_u32_e32 v5, vcc, v2, v5, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v7, v9
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v10, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v3, v6, v8
+; GFX8-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT:    v_mul_hi_u32 v10, v7, v8
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
+; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v9, v3
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v7, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v6, v3, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, s1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s0, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v8, s0, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
@@ -1556,6 +1581,7 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v2, v6
 ; GFX8-NEXT:    v_mul_hi_u32 v11, v2, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v11
@@ -1574,56 +1600,25 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_addc_u32_e64 v7, s[0:1], v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v8, s11, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s10, v7
-; GFX8-NEXT:    v_mul_hi_u32 v12, s10, v2
-; GFX8-NEXT:    v_mul_lo_u32 v11, s10, v2
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v9
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v12
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v11
-; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v11
-; GFX8-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v12, v6
-; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v8
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v11, s[0:1], v11, v12
-; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v11, v9
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, s7, v2
-; GFX8-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX8-NEXT:    v_mul_hi_u32 v9, s6, v2
-; GFX8-NEXT:    v_mul_hi_u32 v2, s7, v2
-; GFX8-NEXT:    v_mov_b32_e32 v8, s7
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX8-NEXT:    v_mul_lo_u32 v7, s10, v3
+; GFX8-NEXT:    v_mul_hi_u32 v9, s10, v2
+; GFX8-NEXT:    v_mul_hi_u32 v2, s11, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, s11
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v9, s7, v3
+; GFX8-NEXT:    v_mul_lo_u32 v9, s11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
-; GFX8-NEXT:    v_mul_hi_u32 v7, s6, v3
+; GFX8-NEXT:    v_mul_hi_u32 v7, s10, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v9, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
-; GFX8-NEXT:    v_mul_hi_u32 v3, s7, v3
+; GFX8-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
 ; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v7, v6
@@ -1634,9 +1629,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v2
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v11
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s6, v9
+; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s10, v9
 ; GFX8-NEXT:    v_subb_u32_e64 v8, s[0:1], v8, v6, vcc
-; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s7, v6
+; GFX8-NEXT:    v_sub_u32_e64 v6, s[0:1], s11, v6
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v7
@@ -1665,7 +1660,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v11, v13, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v6, v12, v6, vcc
-; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
@@ -1717,9 +1711,9 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_sub_u32 s16, 0, s8
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1727,12 +1721,13 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s17, 0, s9
-; GFX9-NEXT:    v_mul_lo_u32 v2, s16, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s17, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s16, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v0
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s15
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
@@ -1755,41 +1750,39 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s17, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s16, v3
-; GFX9-NEXT:    v_mul_hi_u32 v6, s16, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, s16, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s15, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s14, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s14, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s15
+; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1810,7 +1803,6 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
 ; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s14, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v7, v2, vcc
@@ -1829,24 +1821,17 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v8
-; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v4, v2, s[0:1]
+; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s8, v7
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[6:7], s[12:13]
 ; GFX9-NEXT:    s_ashr_i32 s8, s11, 31
 ; GFX9-NEXT:    s_ashr_i32 s12, s3, 31
@@ -1861,112 +1846,118 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX9-NEXT:    s_mov_b32 s13, s12
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s12
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v4, s3
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v5, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v6, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v6
+; GFX9-NEXT:    v_add_f32_e32 v4, v4, v7
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_xor_b64 s[10:11], s[10:11], s[8:9]
-; GFX9-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GFX9-NEXT:    s_sub_u32 s7, 0, s2
-; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
-; GFX9-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX9-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GFX9-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v6, v6
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
-; GFX9-NEXT:    v_add_f32_e32 v4, v7, v4
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX9-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX9-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v5
+; GFX9-NEXT:    v_add_f32_e32 v4, v6, v4
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX9-NEXT:    s_and_b32 s1, s1, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX9-NEXT:    s_cselect_b32 s14, 1, 0
+; GFX9-NEXT:    s_and_b32 s14, s14, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s14, 0
 ; GFX9-NEXT:    s_subb_u32 s14, 0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
+; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX9-NEXT:    v_mov_b32_e32 v10, s1
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v6
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
-; GFX9-NEXT:    v_add3_u32 v5, v8, v9, v10
-; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v5
-; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX9-NEXT:    v_xor_b32_e32 v3, s6, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v5
-; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX9-NEXT:    v_mul_hi_u32 v9, v4, v5
-; GFX9-NEXT:    v_mul_hi_u32 v5, v6, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX9-NEXT:    v_add_u32_e32 v9, v10, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v5, v9, v8, v5
+; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
-; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v6, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s14, v4
-; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v7
-; GFX9-NEXT:    v_mul_hi_u32 v10, s7, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, s7, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
+; GFX9-NEXT:    v_mul_lo_u32 v7, s7, v5
+; GFX9-NEXT:    v_mul_hi_u32 v8, s7, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
+; GFX9-NEXT:    v_xor_b32_e32 v3, s6, v3
 ; GFX9-NEXT:    v_xor_b32_e32 v2, s6, v2
-; GFX9-NEXT:    v_add3_u32 v8, v8, v9, v10
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v11
-; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v4, v11
-; GFX9-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX9-NEXT:    v_mov_b32_e32 v12, s6
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v9, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v8
-; GFX9-NEXT:    v_add_u32_e32 v6, v10, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v7, v7, v8
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v9, v6
-; GFX9-NEXT:    v_add_u32_e32 v10, v11, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v7, v10, v8, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v5, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
-; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v7
+; GFX9-NEXT:    v_add3_u32 v6, v6, v7, v8
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v8, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v9
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v9
+; GFX9-NEXT:    v_mov_b32_e32 v10, s6
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v6
+; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v11, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_add_u32_e32 v9, v11, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v4, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v5, v6, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v7
+; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v6
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s6, v3
-; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v12, vcc
-; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v6
+; GFX9-NEXT:    v_subb_co_u32_e32 v5, vcc, v2, v10, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v7
-; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
-; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v7
+; GFX9-NEXT:    v_mul_lo_u32 v3, s11, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s11, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
+; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, s11, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v7
+; GFX9-NEXT:    v_add3_u32 v3, v7, v3, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s2, v2
@@ -2047,57 +2038,57 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    s_mov_b32 s7, s6
 ; GFX10-NEXT:    s_cmp_lg_u32 s8, 0
 ; GFX10-NEXT:    s_addc_u32 s1, s1, s6
-; GFX10-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[0:1], s[6:7]
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX10-NEXT:    s_sub_u32 s22, 0, s8
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_sub_u32 s20, 0, s8
+; GFX10-NEXT:    s_cselect_b32 s14, 1, 0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s14, s14, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s23, 0, s9
-; GFX10-NEXT:    s_ashr_i32 s16, s11, 31
-; GFX10-NEXT:    s_xor_b64 s[20:21], s[12:13], s[6:7]
-; GFX10-NEXT:    s_ashr_i32 s18, s3, 31
+; GFX10-NEXT:    s_cmp_lg_u32 s14, 0
+; GFX10-NEXT:    s_subb_u32 s21, 0, s9
+; GFX10-NEXT:    s_ashr_i32 s14, s11, 31
+; GFX10-NEXT:    s_xor_b64 s[18:19], s[12:13], s[6:7]
+; GFX10-NEXT:    s_ashr_i32 s16, s3, 31
 ; GFX10-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX10-NEXT:    s_add_u32 s0, s10, s16
-; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
-; GFX10-NEXT:    s_mov_b32 s19, s18
-; GFX10-NEXT:    s_and_b32 s1, s1, 1
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_add_u32 s6, s10, s14
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    s_mov_b32 s17, s16
-; GFX10-NEXT:    s_addc_u32 s1, s11, s16
-; GFX10-NEXT:    s_add_u32 s2, s2, s18
-; GFX10-NEXT:    s_cselect_b32 s6, 1, 0
-; GFX10-NEXT:    s_and_b32 s6, s6, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s6, 0
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
+; GFX10-NEXT:    s_mov_b32 s15, s14
+; GFX10-NEXT:    s_addc_u32 s7, s11, s14
+; GFX10-NEXT:    s_add_u32 s2, s2, s16
+; GFX10-NEXT:    s_cselect_b32 s10, 1, 0
+; GFX10-NEXT:    s_and_b32 s10, s10, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s10, 0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GFX10-NEXT:    s_addc_u32 s3, s3, s18
-; GFX10-NEXT:    s_xor_b64 s[10:11], s[0:1], s[16:17]
-; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[18:19]
+; GFX10-NEXT:    s_addc_u32 s3, s3, s16
+; GFX10-NEXT:    s_xor_b64 s[10:11], s[6:7], s[14:15]
+; GFX10-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s2
 ; GFX10-NEXT:    s_sub_u32 s6, 0, s2
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX10-NEXT:    s_cselect_b32 s7, 1, 0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v1
 ; GFX10-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX10-NEXT:    s_and_b32 s7, s7, 1
+; GFX10-NEXT:    s_cmp_lg_u32 s7, 0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v1, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0xcf800000, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GFX10-NEXT:    s_subb_u32 s7, 0, s3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v3, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s22, v2
+; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v2
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s23, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s22, v0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s22, v0
+; GFX10-NEXT:    v_mul_lo_u32 v4, s21, v0
+; GFX10-NEXT:    v_mul_hi_u32 v5, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s20, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v1
 ; GFX10-NEXT:    v_add3_u32 v3, v4, v3, v5
 ; GFX10-NEXT:    v_trunc_f32_e32 v4, v7
@@ -2111,23 +2102,23 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GFX10-NEXT:    v_add_f32_e32 v1, v9, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v10, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v6, s13, v10, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s13
 ; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v4
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v11
+; GFX10-NEXT:    v_add_co_u32 v6, s13, v6, v11
 ; GFX10-NEXT:    v_mul_lo_u32 v12, s7, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v13, s6, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s6, v1
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v6, v5
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v6, v5
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
 ; GFX10-NEXT:    v_add3_u32 v8, v12, v9, v13
 ; GFX10-NEXT:    v_mul_lo_u32 v9, v4, v11
 ; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v11
@@ -2136,116 +2127,112 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v1, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v4, v8
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v12, s0, v2, v3, vcc_lo
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_mul_hi_u32 v5, v1, v8
-; GFX10-NEXT:    v_mul_lo_u32 v14, s23, v0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v9, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v11
-; GFX10-NEXT:    v_mul_hi_u32 v15, s22, v0
-; GFX10-NEXT:    v_mul_lo_u32 v16, s22, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v13, s22, v0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
+; GFX10-NEXT:    v_mul_lo_u32 v12, s21, v0
+; GFX10-NEXT:    v_add_co_u32 v6, s13, v9, v6
+; GFX10-NEXT:    v_mul_hi_u32 v13, s20, v0
+; GFX10-NEXT:    v_mul_lo_u32 v14, s20, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v7, s13, v7, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v6, s13, v6, v10
+; GFX10-NEXT:    v_mul_lo_u32 v3, s20, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v7, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
+; GFX10-NEXT:    v_add3_u32 v12, v12, v14, v13
 ; GFX10-NEXT:    v_add_nc_u32_e32 v6, v9, v6
-; GFX10-NEXT:    v_add3_u32 v14, v14, v16, v15
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, v12, v13
+; GFX10-NEXT:    v_mul_lo_u32 v10, v2, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
-; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v14
-; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GFX10-NEXT:    v_mul_lo_u32 v15, v12, v14
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
+; GFX10-NEXT:    v_mul_lo_u32 v11, v0, v12
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v5, v6
+; GFX10-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s13
+; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v13, v2, v12
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v5
+; GFX10-NEXT:    v_add_co_u32 v5, s13, v10, v11
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v12
 ; GFX10-NEXT:    v_add3_u32 v6, v7, v6, v8
-; GFX10-NEXT:    v_add_co_u32 v5, s1, v10, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_mul_hi_u32 v16, v0, v14
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v15, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v11, s1, v4, v6, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s1, v5, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v16
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v13, s7, v1
+; GFX10-NEXT:    v_mul_lo_u32 v10, s7, v1
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s13
+; GFX10-NEXT:    v_add_co_u32 v3, s13, v13, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, v4, v6, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v3, s7, v3, v14
 ; GFX10-NEXT:    v_add_nc_u32_e32 v5, v7, v5
-; GFX10-NEXT:    v_mul_hi_u32 v15, s6, v1
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v11
-; GFX10-NEXT:    v_mul_hi_u32 v7, v12, v14
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v10, v16
-; GFX10-NEXT:    v_add_co_u32 v5, s1, v8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v6
-; GFX10-NEXT:    v_add3_u32 v9, v13, v9, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s13
+; GFX10-NEXT:    v_mul_hi_u32 v11, s6, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s7
+; GFX10-NEXT:    v_mul_lo_u32 v13, s6, v4
+; GFX10-NEXT:    v_mul_hi_u32 v7, v2, v12
+; GFX10-NEXT:    v_mul_lo_u32 v6, s6, v1
+; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_add3_u32 v9, v10, v13, v11
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_add3_u32 v5, v8, v5, v7
+; GFX10-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v9
+; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v6
+; GFX10-NEXT:    v_mul_hi_u32 v6, v4, v6
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v3, v4, v9
+; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v13, s0, v2
+; GFX10-NEXT:    v_add_co_u32 v7, s6, v10, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
+; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
+; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v7, s6, v7, v11
+; GFX10-NEXT:    v_mul_lo_u32 v14, s1, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v5, s6, v5, v13
+; GFX10-NEXT:    v_mul_hi_u32 v15, s0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v8, v1, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v5, s7, v5, v12
+; GFX10-NEXT:    v_add_co_u32 v0, s6, v14, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s7
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
+; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
+; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v15
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v11, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v0, s6, v0, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v11, v12, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v10, v7
+; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v8
+; GFX10-NEXT:    v_mul_hi_u32 v8, s8, v0
+; GFX10-NEXT:    v_add3_u32 v2, v11, v5, v2
+; GFX10-NEXT:    v_add_co_u32 v3, s6, v3, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s6
+; GFX10-NEXT:    v_mul_lo_u32 v7, s9, v0
+; GFX10-NEXT:    v_mul_lo_u32 v10, s8, v2
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
+; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v9
+; GFX10-NEXT:    v_mul_lo_u32 v6, s8, v0
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
-; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
-; GFX10-NEXT:    v_mul_lo_u32 v14, v1, v9
-; GFX10-NEXT:    v_mul_lo_u32 v12, v11, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v7, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v13, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, v11, v3
-; GFX10-NEXT:    v_mul_lo_u32 v8, v11, v9
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
-; GFX10-NEXT:    v_mul_hi_u32 v10, v1, v9
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v12, v14
-; GFX10-NEXT:    v_mul_hi_u32 v9, v11, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v3, s1, v8, v3
-; GFX10-NEXT:    v_mul_lo_u32 v8, s15, v0
-; GFX10-NEXT:    v_mul_lo_u32 v14, s14, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s1
-; GFX10-NEXT:    v_mul_hi_u32 v12, s14, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v13
-; GFX10-NEXT:    v_mul_lo_u32 v13, s15, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v13, v0
-; GFX10-NEXT:    v_mul_hi_u32 v15, s14, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v7
-; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v10
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v15
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v14, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v0, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v3, s1, v3, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s1
-; GFX10-NEXT:    v_mul_lo_u32 v6, s9, v0
-; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
-; GFX10-NEXT:    v_add3_u32 v5, v5, v7, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, s8, v0
-; GFX10-NEXT:    v_mul_lo_u32 v8, s8, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, vcc_lo, v4, v5, s0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v0
-; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v4, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v4, v6, v8, v7
-; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v1
-; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s14, v5
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s15, v4
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s15, v4, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v5
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v4, v5, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v4, v7, v10, v8
+; GFX10-NEXT:    v_mul_lo_u32 v5, s11, v1
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, s0, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v14, s10, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s1, v4
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v4, s0, s1, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, s11, v1
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
@@ -2253,104 +2240,105 @@ define amdgpu_kernel void @sdivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s10, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v17, s10, v3
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v5, s8
+; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v6, s8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v13
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v14
+; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v15, v7
-; GFX10-NEXT:    v_add_co_u32 v1, s1, v6, v1
+; GFX10-NEXT:    v_add_co_u32 v1, s1, v5, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v17
 ; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v14, v1
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v13
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v15
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, v5, v15
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s0
 ; GFX10-NEXT:    v_add_co_u32 v7, s0, v7, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v14, s0, v17, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_add3_u32 v3, v5, v1, v3
+; GFX10-NEXT:    v_sub_co_u32 v1, s0, v12, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v5, s0, 0, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v17, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v14, v18, v15, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v15, s3, v7
+; GFX10-NEXT:    v_mul_lo_u32 v16, s2, v3
+; GFX10-NEXT:    v_mul_hi_u32 v17, s2, v7
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v10
+; GFX10-NEXT:    v_mul_lo_u32 v10, s2, v7
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
-; GFX10-NEXT:    v_add3_u32 v3, v6, v1, v3
-; GFX10-NEXT:    v_mul_lo_u32 v10, s3, v7
-; GFX10-NEXT:    v_mul_lo_u32 v16, s2, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v18, v15, s0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v3
-; GFX10-NEXT:    v_mul_hi_u32 v15, s2, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v17, v14, s0
-; GFX10-NEXT:    v_sub_co_u32 v14, s1, v12, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v6, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v6, v10, v11, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v13, v8, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, s0, s10, v16
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s1, s11, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v6
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v10
-; GFX10-NEXT:    v_xor_b32_e32 v0, s20, v0
-; GFX10-NEXT:    v_xor_b32_e32 v1, s21, v1
-; GFX10-NEXT:    v_xor_b32_e32 v5, s12, v5
-; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v2
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, vcc_lo, s3, v4, s0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v4, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s20
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s21, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v4, vcc_lo, s3, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v8, v15, v16, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v13, v5, s0
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, s10, v10
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v11, s1, s11, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s11, v8
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v11
+; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, s19, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, vcc_lo, s3, v1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v10, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v2, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v11
+; GFX10-NEXT:    v_xor_b32_e32 v2, s12, v6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s3, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v12, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
 ; GFX10-NEXT:    v_add_co_u32 v15, s0, v7, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v3, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v11, v14, s0
-; GFX10-NEXT:    v_add_co_u32 v14, s0, v15, 1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v6, v12, s0
+; GFX10-NEXT:    v_add_co_u32 v12, s0, v15, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v16, s0
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_sub_co_u32 v11, s0, v12, s2
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v4, s0, 0, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_sub_co_u32 v6, s0, v13, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s0, 0, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v15, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v12, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v14, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v13, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v8, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v12, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v10, v4, s0
-; GFX10-NEXT:    s_xor_b64 s[0:1], s[16:17], s[18:19]
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v5, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v2, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v10, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v11, v6, s0
+; GFX10-NEXT:    s_xor_b64 s[0:1], s[14:15], s[16:17]
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v2, s12
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s0, v7
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s1, v3
-; GFX10-NEXT:    v_xor_b32_e32 v6, s16, v6
-; GFX10-NEXT:    v_xor_b32_e32 v7, s16, v8
+; GFX10-NEXT:    v_xor_b32_e32 v7, s14, v10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s12, v8, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v8, s14, v6
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v2, s0
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v3, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s16
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s16, v7, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s14
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s14, v8, vcc_lo
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_dwordx4 v9, v[0:3], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx4 v9, v[4:7], s[6:7]

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index 676178c6de26..df70a90fd3c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -21,40 +21,40 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v3, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v3
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v6
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v5
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v5, v6, vcc
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v6
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
+; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v6
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v1
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
-; CHECK-NEXT:    v_xor_b32_e32 v5, v5, v1
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
@@ -62,102 +62,100 @@ define i64 @v_srem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v9, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
-; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
-; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v10
+; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v10
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v11, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, v5, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v5, v2
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v5, v2, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v5, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, v5, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v4, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
 ; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v3
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v3
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v1
-; CHECK-NEXT:    v_xor_b32_e32 v3, v0, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr2
 ; CHECK-NEXT:    ; implicit-def: $vgpr4
 ; CHECK-NEXT:  BB0_2: ; %Flow
@@ -221,24 +219,53 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; CHECK-NEXT:    s_mov_b32 s7, s6
 ; CHECK-NEXT:    s_xor_b64 s[10:11], s[10:11], s[6:7]
-; CHECK-NEXT:    s_sub_u32 s3, 0, s8
+; CHECK-NEXT:    s_sub_u32 s0, 0, s8
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; CHECK-NEXT:    s_cselect_b32 s0, 1, 0
-; CHECK-NEXT:    s_and_b32 s0, s0, 1
-; CHECK-NEXT:    s_cmp_lg_u32 s0, 0
+; CHECK-NEXT:    s_cselect_b32 s1, 1, 0
+; CHECK-NEXT:    s_and_b32 s1, s1, 1
+; CHECK-NEXT:    s_cmp_lg_u32 s1, 0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    s_subb_u32 s5, 0, s9
+; CHECK-NEXT:    s_subb_u32 s1, 0, s9
+; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v5, v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, s1, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, s0, v1
+; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; CHECK-NEXT:    v_mov_b32_e32 v6, s9
-; CHECK-NEXT:    v_mul_lo_u32 v3, s3, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, s5, v0
-; CHECK-NEXT:    v_mul_hi_u32 v5, s3, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_mul_lo_u32 v3, v1, v4
@@ -263,38 +290,7 @@ define amdgpu_ps i64 @s_srem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v4, s5, v0
-; CHECK-NEXT:    v_mul_lo_u32 v5, s3, v3
-; CHECK-NEXT:    v_mul_hi_u32 v8, s3, v0
-; CHECK-NEXT:    v_mul_lo_u32 v7, s3, v0
-; CHECK-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v2
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v4, v8
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v7
-; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v4
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v8, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v4
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[0:1], v5, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[0:1], v7, v8
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v4
-; CHECK-NEXT:    v_add_i32_e64 v2, s[0:1], v5, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[0:1], v7, v5
-; CHECK-NEXT:    v_add_i32_e64 v3, s[0:1], v3, v4
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v3, s10, v1
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s10, v0
@@ -445,38 +441,36 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v12, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v16, v11, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v14, v11, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v11, v8
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v16
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v16
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v14
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v11
+; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v10, v14
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v15, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v8
@@ -581,40 +575,38 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], v8, v12, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v7
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v13
-; GISEL-NEXT:    v_mul_lo_u32 v15, v10, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v15
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v10
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v13
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v13
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
@@ -695,35 +687,35 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v5
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v0
 ; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v5, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v1, v0
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v3
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v11
-; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v11
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v1
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v4
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, v11, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v10, v10
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v3
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v11, v10
 ; CGP-NEXT:    v_mul_hi_u32 v16, v11, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v11, v2
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v1
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v10, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v1
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -742,41 +734,39 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v10, v13, vcc
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v13, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v12, v2
-; CGP-NEXT:    v_mul_lo_u32 v15, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v11, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v14, v11, v2
 ; CGP-NEXT:    v_mul_hi_u32 v11, v11, v2
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v2, v11
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v13, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v11
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT:    v_mul_hi_u32 v11, v14, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v11
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v15, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v11
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v11, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, 0, v10, vcc
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v11, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v5, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v5, v2
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -784,7 +774,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v3, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
@@ -796,42 +786,42 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_mul_lo_u32 v11, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v10, v1, v10
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_subb_u32_e64 v10, s[4:5], v5, v2, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v5, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v0
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v4, v3
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v3, v1
 ; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v11, v3
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v11, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v11, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v1
-; CGP-NEXT:    v_xor_b32_e32 v3, v0, v1
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr4
 ; CGP-NEXT:    ; implicit-def: $vgpr10
 ; CGP-NEXT:  BB2_2: ; %Flow2
@@ -870,35 +860,35 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v7
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v7, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v5, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v5
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v3
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v3, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v9, v6, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v5
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
 ; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v3
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v6
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v3
+; CGP-NEXT:    v_xor_b32_e32 v7, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
@@ -917,41 +907,39 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v12
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v11, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
@@ -959,7 +947,7 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v6, v8
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
@@ -971,42 +959,42 @@ define <2 x i64> @v_srem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, v5, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v5, v4
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, v8
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT:    v_sub_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v7, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v5
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v6, v5
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
 ; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v5
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v3
-; CGP-NEXT:    v_xor_b32_e32 v5, v2, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v4, v3
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT:    v_xor_b32_e32 v4, v2, v6
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v6
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v6, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr6
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB2_6: ; %Flow
@@ -1043,32 +1031,62 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
+; CHECK-NEXT:    s_movk_i32 s4, 0xf000
+; CHECK-NEXT:    s_movk_i32 s6, 0x1000
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1076,7 +1094,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1087,44 +1105,12 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CHECK-NEXT:    s_movk_i32 s6, 0x1000
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1132,31 +1118,31 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
+; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1170,13 +1156,13 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 4096
   ret i64 %result
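
[A note for readers decoding the hunks above, not part of the diff: the
before/after difference is how the expansion performs a 64-bit add out of
32-bit halves. Deferring the low half's carry (the removed
v_addc_u32_e64/v_add_i32_e64 pairs carrying through s[4:5], ending in
"v_addc_u32_e32 v4, vcc, 0, v4, vcc") costs an extra add and an extra live
carry pair; carrying it straight through needs only v_add_i32 + v_addc_u32
on vcc. A minimal C sketch of the two shapes, with illustrative names:

  #include <stdint.h>

  /* Old shape: high halves added without the low half's carry-in;
     the carry is folded back in as a separate, final step. */
  static uint64_t add64_deferred_carry(uint32_t a_lo, uint32_t a_hi,
                                       uint32_t b_lo, uint32_t b_hi) {
    uint32_t lo = a_lo + b_lo;
    uint32_t carry = lo < a_lo;     /* carry-out of the low add */
    uint32_t hi = a_hi + b_hi;      /* no carry-in here... */
    hi += carry;                    /* ...added back in later */
    return ((uint64_t)hi << 32) | lo;
  }

  /* New shape: one carry chain (v_add_i32 + v_addc_u32). */
  static uint64_t add64_carry_chain(uint32_t a_lo, uint32_t a_hi,
                                    uint32_t b_lo, uint32_t b_hi) {
    uint32_t lo = a_lo + b_lo;
    uint32_t carry = lo < a_lo;
    uint32_t hi = a_hi + b_hi + carry;
    return ((uint64_t)hi << 32) | lo;
  }

Both helpers return the same value; the payoff in the CHECK lines is the
shorter sequence and one fewer live carry (vcc only, no s[4:5]).]
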
@@ -1197,13 +1183,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    s_sub_u32 s11, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s8
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s12, 0, s9
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -1212,10 +1198,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
@@ -1242,39 +1228,37 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
@@ -1337,14 +1321,14 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GISEL-NEXT:    s_sub_u32 s8, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GISEL-NEXT:    s_subb_u32 s9, 0, s7
+; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
@@ -1353,13 +1337,13 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1387,39 +1371,37 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
@@ -1487,32 +1469,31 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_movk_i32 s6, 0xf000
+; CGP-NEXT:    s_movk_i32 s7, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v4, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v5
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_movk_i32 s7, 0x1000
+; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -1531,40 +1512,39 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, s6, v7
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
@@ -1655,40 +1635,38 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
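
[The bulk of every hunk in this file is the same recurring pattern:
v_mul_lo_u32/v_mul_hi_u32 partial products combined with add/addc, with
"v_cndmask_b32 ..., 0, 1, vcc" materializing each carry bit. That pattern
is the 32-bit-half expansion of the high half of a 64x64-bit multiply. A
sketch of what one such block computes, names illustrative:

  #include <stdint.h>

  /* High 64 bits of a 64x64-bit multiply, built from 32-bit halves the
     way the v_mul_lo_u32/v_mul_hi_u32/v_cndmask chains do. */
  static uint64_t mulhi64(uint64_t a, uint64_t b) {
    uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
    uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
    uint64_t lo   = (uint64_t)a_lo * b_lo;
    uint64_t mid1 = (uint64_t)a_hi * b_lo + (lo >> 32);
    uint64_t mid2 = (uint64_t)a_lo * b_hi + (uint32_t)mid1;
    return (uint64_t)a_hi * b_hi + (mid1 >> 32) + (mid2 >> 32);
  }

Each "v_cndmask_b32 ..., 0, 1, vcc" in the CHECK lines is one of the
implicit carries here, turned into a 0/1 value so it can be added into
the next partial sum.]
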
@@ -1758,32 +1736,62 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_srem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
-; CHECK-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
-; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
-; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
-; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
+; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
+; CHECK-NEXT:    s_mov_b32 s4, 0xffed2705
+; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
+; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; CHECK-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v4
-; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v4
-; CHECK-NEXT:    v_mul_hi_u32 v8, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v3
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, -1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v4
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
 ; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
@@ -1791,7 +1799,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v2, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -1802,44 +1810,12 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e64 v6, s[4:5], v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v3
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v6
-; CHECK-NEXT:    v_mul_hi_u32 v10, s6, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, s6, v3
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v5
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v9
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
-; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CHECK-NEXT:    v_mul_hi_u32 v6, v6, v7
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v8, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
@@ -1847,31 +1823,31 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, 0, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v4
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, s6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v3
+; CHECK-NEXT:    v_subb_u32_e64 v4, s[4:5], v1, v2, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
 ; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_mov_b32_e32 v5, s7
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[4:5]
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[4:5]
 ; CHECK-NEXT:    v_subrev_i32_e32 v5, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
 ; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
@@ -1885,13 +1861,13 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
-; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v2
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v3
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v3
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 1235195
   ret i64 %result
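
[The scaffolding left unchanged in these hunks is the signed-to-unsigned
wrapper visible at both ends of each function: v_ashrrev_i32_e32 31 builds
a sign mask, add/addc plus xor take the absolute value of the numerator,
and the closing xor/sub/subb put the numerator's sign back on the unsigned
remainder. A hedged C sketch, where urem64 stands in for the expansion in
the hunks above:

  #include <stdint.h>

  /* Stand-in for the unsigned remainder expansion shown above. */
  static uint64_t urem64(uint64_t n, uint64_t d) { return n % d; }

  /* srem as urem on absolute values; the remainder takes the sign of
     the numerator only, matching the xor/sub/subb epilogue. Assumes an
     arithmetic right shift on signed values (true for the compilers
     involved here). */
  static int64_t srem_via_urem(int64_t num, int64_t den) {
    uint64_t sn = (uint64_t)(num >> 63);      /* v_ashrrev_i32_e32 31 */
    uint64_t sd = (uint64_t)(den >> 63);
    uint64_t un = ((uint64_t)num + sn) ^ sn;  /* |num|: add, addc, xor */
    uint64_t ud = ((uint64_t)den + sd) ^ sd;  /* |den| */
    uint64_t r  = urem64(un, ud);
    return (int64_t)((r ^ sn) - sn);          /* restore sign: xor, sub */
  }

For the constant-denominator tests here only the numerator is negated at
runtime, but the same wrapper shape applies.]
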
@@ -1912,13 +1888,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[6:7]
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    s_sub_u32 s11, 0, s8
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s8
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s12, 0, s9
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
+; GISEL-NEXT:    s_subb_u32 s5, 0, s9
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
@@ -1927,10 +1903,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s11, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
@@ -1957,39 +1933,37 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s11, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s11, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v0, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v5
@@ -2052,14 +2026,14 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s7
-; GISEL-NEXT:    s_sub_u32 s8, 0, s6
-; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    s_and_b32 s4, s4, 1
+; GISEL-NEXT:    s_sub_u32 s4, 0, s6
+; GISEL-NEXT:    s_cselect_b32 s5, 1, 0
+; GISEL-NEXT:    s_and_b32 s5, s5, 1
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
+; GISEL-NEXT:    s_cmp_lg_u32 s5, 0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
-; GISEL-NEXT:    s_subb_u32 s9, 0, s7
+; GISEL-NEXT:    s_subb_u32 s5, 0, s7
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
 ; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
@@ -2068,13 +2042,13 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v7, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, s8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
@@ -2102,39 +2076,37 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_addc_u32_e64 v8, s[4:5], v6, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s9, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v8
-; GISEL-NEXT:    v_mul_hi_u32 v12, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, s8, v5
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, s4, v6
+; GISEL-NEXT:    v_mul_hi_u32 v10, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v12, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v10
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v2, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v5
@@ -2202,32 +2174,62 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
 ; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
+; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
+; CGP-NEXT:    v_mov_b32_e32 v4, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v4
 ; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v7, v5
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v7
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, -1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, s6, v8
 ; CGP-NEXT:    v_mul_hi_u32 v12, s6, v7
 ; CGP-NEXT:    v_mul_lo_u32 v11, s6, v7
-; CGP-NEXT:    s_mov_b32 s7, 0x12d8fb
+; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_mul_lo_u32 v10, v8, v11
 ; CGP-NEXT:    v_mul_lo_u32 v12, v7, v9
 ; CGP-NEXT:    v_mul_hi_u32 v13, v7, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    s_bfe_i32 s8, -1, 0x10000
+; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
@@ -2246,39 +2248,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v8, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v13, s6, v7
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v10, v13
-; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v12, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v14, v9
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v1, v7
 ; CGP-NEXT:    v_mul_lo_u32 v10, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v11, v0, v7
@@ -2370,40 +2340,38 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v7, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, -1, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
-; CGP-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v5, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v9, v12
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v8, -1, v5
+; CGP-NEXT:    v_mul_lo_u32 v9, s6, v7
+; CGP-NEXT:    v_mul_hi_u32 v11, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, s6, v5
 ; CGP-NEXT:    v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v13, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v5, v10
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v9, v10
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v5, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v10
 ; CGP-NEXT:    v_xor_b32_e32 v3, v3, v6
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v7
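
[One more decoding note, offered as a reading of the instruction pattern
rather than a statement of the backend's code: the block each function
runs twice appears to be one refinement step of an approximate 2^64/d
reciprocal. The v_mul_lo_u32 pairs form the low 64 bits of r * (0 - d);
for the 0x1000 denominator, 0 - d = 0xfffffffffffff000, which is where
the s_movk 0xf000 low half and the literal -1 high half fed to
v_mul_lo_u32 come from. A sketch, with a compiler-extension mulhi
standing in for the 32-bit-half version sketched earlier:

  #include <stdint.h>

  /* Clang/GCC unsigned __int128 stand-in for the 32-bit-half mulhi. */
  static uint64_t mulhi64(uint64_t a, uint64_t b) {
    return (uint64_t)(((unsigned __int128)a * b) >> 64);
  }

  /* One apparent refinement step for r ~ 2^64 / d: the mul_lo pairs
     form the low 64 bits of r * (0 - d), the mul_hi/carry chain forms
     the high half of the product, and the closing add/addc (the single
     carry chain after this patch) folds it back into r. */
  static uint64_t refine(uint64_t r, uint64_t d) {
    return r + mulhi64(r, r * (0 - d));
  }

]
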
@@ -2488,40 +2456,40 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v0, 31, v6
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v0
 ; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v5, v1, v0
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v5
-; CHECK-NEXT:    v_cvt_f32_u32_e32 v6, v0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v1
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v6
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT:    v_cvt_f32_u32_e32 v5, v0
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v5
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v5
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v2
-; CHECK-NEXT:    v_trunc_f32_e32 v6, v6
-; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v6
+; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
+; CHECK-NEXT:    v_trunc_f32_e32 v5, v5
+; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_subb_u32_e32 v8, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v6
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v7, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v2
-; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v1
+; CHECK-NEXT:    v_xor_b32_e32 v3, v3, v6
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v11
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v11
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v9
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v11
-; CHECK-NEXT:    v_mul_hi_u32 v11, v6, v11
-; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v11
+; CHECK-NEXT:    v_xor_b32_e32 v4, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v13, v6, v9
+; CHECK-NEXT:    v_mul_lo_u32 v13, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v9
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
@@ -2529,102 +2497,100 @@ define i64 @v_srem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; CHECK-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v9, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v8, v2
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v12, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, v10, v12
-; CHECK-NEXT:    v_mul_lo_u32 v11, v2, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v12
-; CHECK-NEXT:    v_mul_hi_u32 v12, v10, v12
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_mul_lo_u32 v9, v10, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; CHECK-NEXT:    v_mul_hi_u32 v7, v10, v7
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v10
+; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v10
+; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v10
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v11, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v7
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v6
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v6
+; CHECK-NEXT:    v_mul_lo_u32 v9, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v6
+; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v5, v2
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v3, vcc, v3, v8
-; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v4, v2, vcc
+; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v4, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v2, s[4:5], v4, v2
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v6, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, v0
+; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v5
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v3, v1
 ; CHECK-NEXT:    v_subbrev_u32_e64 v8, s[4:5], 0, v2, vcc
 ; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v5
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CHECK-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v2, vcc, v7, v5
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v7, v1
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, v9, v10, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CHECK-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v2, v2, v1
-; CHECK-NEXT:    v_xor_b32_e32 v3, v0, v1
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_xor_b32_e32 v2, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v1, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v2, v6, vcc
 ; CHECK-NEXT:    ; implicit-def: $vgpr5_vgpr6
 ; CHECK-NEXT:    ; implicit-def: $vgpr3
 ; CHECK-NEXT:  BB7_2: ; %Flow
@@ -2662,8 +2628,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[6:7], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[6:7], v4
+; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
+; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
@@ -2675,108 +2641,106 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v9
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
-; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v8
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v0
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v7
+; GISEL-NEXT:    v_xor_b32_e32 v16, v0, v4
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v14
-; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v12
-; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v14
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v14
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v12
+; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v14
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v12
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v15, v1
-; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v15, v7, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT:    v_addc_u32_e64 v1, s[4:5], v7, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v1
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v9, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v11, v10, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v0
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v14
-; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v1, v14
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v12, v1, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v10
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_mul_hi_u32 v1, v1, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v12
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v0, v11
-; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v9, v10
-; GISEL-NEXT:    v_lshl_b64 v[0:1], s[6:7], v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v16, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v12
+; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v1
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v7
+; GISEL-NEXT:    v_lshl_b64 v[0:1], s[4:5], v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v16, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v11, v16, v7
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v10
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
 ; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v9, v11
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v16, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v16, v6
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v16, v10
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v13, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v13, v6
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v8
@@ -2816,115 +2780,113 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v2, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v3, v9, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_xor_b32_e32 v3, v1, v9
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v1, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v3, v3
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, 0, v8
 ; GISEL-NEXT:    v_subb_u32_e32 v11, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v1
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v10, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v9
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v14
+; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v12
-; GISEL-NEXT:    v_mul_hi_u32 v16, v0, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v1, v14
+; GISEL-NEXT:    v_mul_hi_u32 v1, v0, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v3, v14
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v16, v1, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v12
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v15, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v12
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
-; GISEL-NEXT:    v_mul_hi_u32 v12, v1, v12
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v3, v12
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v13
-; GISEL-NEXT:    v_mul_lo_u32 v15, v10, v0
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v3, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v11, v10, v1
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v0
-; GISEL-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v15
-; GISEL-NEXT:    v_mul_lo_u32 v14, v0, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v10
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v15, v14
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v10
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v10, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v11
-; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v10
-; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v12
+; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v1, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v3
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v10
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v1, v3, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v16, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v6, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v4, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v10
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v4, v16, v10
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v2, v11
+; GISEL-NEXT:    v_mul_lo_u32 v5, v2, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v10
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v11
+; GISEL-NEXT:    v_mul_hi_u32 v6, v16, v3
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v11
+; GISEL-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v3, v8, v3
+; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v10
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v2, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v4
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v16, v6
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v2, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v5, v7
 ; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v7, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v3, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, v4, v8
 ; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
@@ -2938,8 +2900,8 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
 ; GISEL-NEXT:    v_xor_b32_e32 v4, v2, v9
@@ -2968,35 +2930,35 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v0
 ; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v3, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v3, v1, v0
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; CGP-NEXT:    v_xor_b32_e32 v0, v2, v0
-; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v3
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v0
-; CGP-NEXT:    v_ashrrev_i32_e32 v1, 31, v9
-; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v4
+; CGP-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT:    v_cvt_f32_u32_e32 v3, v0
+; CGP-NEXT:    v_ashrrev_i32_e32 v4, 31, v9
+; CGP-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, v2
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v1
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v4
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v9, v4, vcc
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v2
 ; CGP-NEXT:    v_trunc_f32_e32 v8, v8
 ; CGP-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v8
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v1
 ; CGP-NEXT:    v_subb_u32_e32 v12, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v12, v2
 ; CGP-NEXT:    v_mul_lo_u32 v14, v9, v8
 ; CGP-NEXT:    v_mul_hi_u32 v16, v9, v2
 ; CGP-NEXT:    v_mul_lo_u32 v15, v9, v2
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v1
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v4
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
 ; CGP-NEXT:    v_mul_lo_u32 v14, v8, v15
 ; CGP-NEXT:    v_mul_lo_u32 v16, v2, v13
 ; CGP-NEXT:    v_mul_hi_u32 v17, v2, v15
 ; CGP-NEXT:    v_mul_hi_u32 v15, v8, v15
-; CGP-NEXT:    v_xor_b32_e32 v6, v6, v1
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v4
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
@@ -3015,41 +2977,39 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v15, vcc, v16, v15
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v14
-; CGP-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v13, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v12, v2
-; CGP-NEXT:    v_mul_lo_u32 v15, v9, v14
-; CGP-NEXT:    v_mul_lo_u32 v16, v9, v2
+; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
+; CGP-NEXT:    v_mul_lo_u32 v14, v9, v2
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v2
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; CGP-NEXT:    v_mul_lo_u32 v12, v14, v16
-; CGP-NEXT:    v_mul_lo_u32 v15, v2, v9
-; CGP-NEXT:    v_mul_hi_u32 v13, v2, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v14, v16
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v13, v14, v9
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v15, v12
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v9
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
-; CGP-NEXT:    v_mul_hi_u32 v9, v14, v9
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v15, v13
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v8, v14
+; CGP-NEXT:    v_mul_lo_u32 v13, v2, v9
+; CGP-NEXT:    v_mul_hi_u32 v15, v2, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v15, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v9
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
+; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v9, v6, v2
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v3, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v6, v2
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
@@ -3057,7 +3017,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v8
+; CGP-NEXT:    v_mul_hi_u32 v12, v3, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
@@ -3069,42 +3029,42 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_lo_u32 v12, v3, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v3, v2
+; CGP-NEXT:    v_mul_lo_u32 v8, v1, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v12
 ; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v6, v2, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v2, s[4:5], v6, v2
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v3, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v0
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v2, v0, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v4, v3
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v3, v1
 ; CGP-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v0
 ; CGP-NEXT:    v_subb_u32_e32 v0, vcc, v2, v0, vcc
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v9, v3
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, v13, v14, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v12, v0, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v1
-; CGP-NEXT:    v_xor_b32_e32 v3, v0, v1
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v2, v1
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT:    v_xor_b32_e32 v2, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v2, v4, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr2_vgpr3
 ; CGP-NEXT:    ; implicit-def: $vgpr8
 ; CGP-NEXT:  BB8_2: ; %Flow2
@@ -3143,40 +3103,40 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v11
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v10, v2
 ; CGP-NEXT:    v_addc_u32_e32 v4, vcc, v11, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v6, v3, v2
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v2
 ; CGP-NEXT:    v_xor_b32_e32 v2, v4, v2
-; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v6
-; CGP-NEXT:    v_cvt_f32_u32_e32 v8, v2
-; CGP-NEXT:    v_ashrrev_i32_e32 v3, 31, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v3
-; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v8
+; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; CGP-NEXT:    v_cvt_f32_u32_e32 v6, v2
+; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v6
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, 0, v3
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v8, v8
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v8
+; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_subb_u32_e32 v10, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v8
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v7
 ; CGP-NEXT:    v_mul_hi_u32 v14, v9, v4
 ; CGP-NEXT:    v_mul_lo_u32 v13, v9, v4
-; CGP-NEXT:    v_xor_b32_e32 v5, v5, v3
+; CGP-NEXT:    v_xor_b32_e32 v5, v5, v8
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_mul_lo_u32 v12, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, v13
 ; CGP-NEXT:    v_mul_lo_u32 v14, v4, v11
 ; CGP-NEXT:    v_mul_hi_u32 v15, v4, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_xor_b32_e32 v7, v7, v3
+; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
+; CGP-NEXT:    v_xor_b32_e32 v6, v6, v8
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v8, v11
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
 ; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
@@ -3184,102 +3144,100 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_addc_u32_e64 v12, s[4:5], v8, v11, vcc
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v10, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v14, v9, v4
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
 ; CGP-NEXT:    v_mul_hi_u32 v9, v9, v4
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v9
-; CGP-NEXT:    v_mul_lo_u32 v10, v12, v14
-; CGP-NEXT:    v_mul_lo_u32 v13, v4, v9
-; CGP-NEXT:    v_mul_hi_u32 v11, v4, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v12, v14
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v11, v12, v9
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v13, v10
-; CGP-NEXT:    v_mul_hi_u32 v13, v4, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v9
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v9, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_mul_lo_u32 v10, v7, v12
+; CGP-NEXT:    v_mul_lo_u32 v11, v4, v9
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v7, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v9
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, 0, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v5, v8
+; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v6, v4
+; CGP-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; CGP-NEXT:    v_mul_hi_u32 v11, v5, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v7, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_hi_u32 v10, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v10, v5, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v8, v7, v8
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CGP-NEXT:    v_mul_lo_u32 v9, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v7, v3, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; CGP-NEXT:    v_sub_i32_e32 v5, vcc, v5, v10
-; CGP-NEXT:    v_subb_u32_e64 v8, s[4:5], v7, v4, vcc
-; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
+; CGP-NEXT:    v_subb_u32_e64 v7, s[4:5], v6, v4, vcc
+; CGP-NEXT:    v_sub_i32_e64 v4, s[4:5], v6, v4
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v7, v2
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v8, v2
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v7, v2
 ; CGP-NEXT:    v_subb_u32_e32 v4, vcc, v4, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v5, v3
 ; CGP-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v4, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v6
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], v10, v2
 ; CGP-NEXT:    v_subb_u32_e32 v2, vcc, v4, v2, vcc
-; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v9, v6
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v9, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; CGP-NEXT:    v_xor_b32_e32 v4, v4, v3
-; CGP-NEXT:    v_xor_b32_e32 v5, v2, v3
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v4, v3
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v5, v3, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT:    v_xor_b32_e32 v4, v2, v8
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v3, v8
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v4, v8, vcc
 ; CGP-NEXT:    ; implicit-def: $vgpr10_vgpr11
 ; CGP-NEXT:    ; implicit-def: $vgpr5
 ; CGP-NEXT:  BB8_6: ; %Flow
@@ -3377,90 +3335,88 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v1
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v1
-; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, 0, v1
+; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_and_b32_e32 v5, s6, v0
-; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
 ; GISEL-NEXT:    v_and_b32_e32 v6, s6, v6
-; GISEL-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v2
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v0
 ; GISEL-NEXT:    v_trunc_f32_e32 v4, v4
-; GISEL-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v0
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v2
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v2
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v2
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v4, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v11
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v0, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v12
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v7, v10
+; GISEL-NEXT:    v_and_b32_e32 v0, s6, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v7, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v4, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v2
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v2
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v10, v12
-; GISEL-NEXT:    v_mul_lo_u32 v11, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v2
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v2
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v2
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v10
+; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v10
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 0, v5
+; GISEL-NEXT:    v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v2
-; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v2
+; GISEL-NEXT:    v_mul_hi_u32 v2, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v13, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
@@ -3468,7 +3424,7 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
@@ -3480,8 +3436,8 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v5, v8
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v13, v2, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v13, v2
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v11, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v11, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v1
@@ -3492,11 +3448,9 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v4, v1
 ; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v2, vcc, v2, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v9, v3
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0, v6
@@ -3504,116 +3458,116 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_addc_u32_e64 v6, s[4:5], 0, 0, vcc
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v3
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v6
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v11, 0x4f800000, v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v8, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v8
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v5, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v4
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, 0, v3
-; GISEL-NEXT:    v_subb_u32_e32 v8, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v9
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 0, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v4, v11
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], 0, v3
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], 0, v6, s[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v8, v4
+; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v4
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v11
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v4, v10
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v13, v5
+; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT:    v_addc_u32_e64 v4, s[4:5], v5, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v7, v7, v0
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, v11
-; GISEL-NEXT:    v_mul_lo_u32 v10, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v9, v0, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v4, v11
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v4
+; GISEL-NEXT:    v_mul_lo_u32 v9, v8, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v10
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v0, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_hi_u32 v4, v4, v7
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v9
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v5, v4, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v0, v8
-; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v4
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v4, v10
+; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v5
 ; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0, v1
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v2, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v2, v13, v5
+; GISEL-NEXT:    v_mul_hi_u32 v2, v11, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v14, v5
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v5
+; GISEL-NEXT:    v_mul_hi_u32 v4, v12, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; GISEL-NEXT:    v_mul_hi_u32 v8, v13, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT:    v_mul_hi_u32 v4, v14, v4
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v6, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v3, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v2, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v13, v7
-; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v14, v2, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v14, v2
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v11, v7
+; GISEL-NEXT:    v_subb_u32_e64 v5, s[4:5], v12, v2, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v12, v2
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v5, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v3

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index ac9c1576d02e..98b46a09b114 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -58,38 +58,36 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v1
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v5, v0
@@ -195,24 +193,24 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
-; CHECK-NEXT:    s_sub_u32 s6, 0, s2
-; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_sub_u32 s4, 0, s2
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
-; CHECK-NEXT:    s_subb_u32 s7, 0, s3
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v0
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -237,38 +235,36 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_addc_u32_e64 v5, s[4:5], v2, v4, vcc
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v5, v4
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v9
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v5, s5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v6, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, s1, v0
@@ -409,38 +405,36 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v10
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
@@ -536,38 +530,36 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v4
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v11
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v5, vcc, 0, v5, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
@@ -680,38 +672,36 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
-; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
 ; CGP-NEXT:    v_mul_lo_u32 v3, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v2
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v15, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v13, v3
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v12, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v2, v11, v0
 ; CGP-NEXT:    v_mul_hi_u32 v3, v10, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v11, v0
@@ -844,38 +834,36 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v3, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v2
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v5, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v11, v4
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v10
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v2
 ; CGP-NEXT:    v_mul_hi_u32 v5, v8, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v9, v2
@@ -1130,38 +1118,36 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v0
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v9
-; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v10
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v2
-; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v9, v2
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
@@ -1299,38 +1285,36 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v6
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
@@ -1426,38 +1410,36 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v7, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v11
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, 0, v7, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
@@ -1573,38 +1555,36 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
-; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v4, v0
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v14
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
 ; CGP-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v4
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
 ; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
@@ -1737,38 +1717,36 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v9
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v4
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v12, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v6
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v7, v2
 ; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
@@ -1987,171 +1965,167 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
-; GISEL-NEXT:    s_bfe_i32 s10, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s12, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s13, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s7, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_mov_b32_e32 v16, s4
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_mov_b32_e32 v19, s5
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mov_b32_e32 v15, s7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v4, v6
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
-; GISEL-NEXT:    v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
+; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v16
-; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v17
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v17
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v18, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v14
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v15
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v4, v10
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v18, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v10, v5
-; GISEL-NEXT:    v_mov_b32_e32 v5, s10
-; GISEL-NEXT:    v_mov_b32_e32 v10, s11
-; GISEL-NEXT:    v_add_i32_e64 v8, s[10:11], v8, v12
-; GISEL-NEXT:    v_mov_b32_e32 v12, s12
-; GISEL-NEXT:    v_add_i32_e64 v11, s[10:11], v11, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v16, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
-; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v16, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v15, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
-; GISEL-NEXT:    v_mov_b32_e32 v19, s13
-; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v17
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v18
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v14, v13
-; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v16, v15
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v14, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, 0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v18, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
+; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v16, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT:    v_mov_b32_e32 v18, s8
+; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, 0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v17, v2, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v16, v1, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v3, v7
-; GISEL-NEXT:    v_mul_lo_u32 v17, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v18, v3, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, 1, v4
-; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v16
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_subb_u32_e64 v13, s[4:5], 0, v9, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v13, v1, v5
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v3, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v17, v3, v7
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v10
+; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v5
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], 0, v9, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v13
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], 1, v7
-; GISEL-NEXT:    v_addc_u32_e64 v18, s[6:7], 0, v8, s[6:7]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v14
-; GISEL-NEXT:    v_subb_u32_e64 v14, s[8:9], 0, v11, s[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v16, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_add_i32_e64 v8, s[6:7], 1, v7
+; GISEL-NEXT:    v_addc_u32_e64 v17, s[6:7], 0, v6, s[6:7]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[8:9], 0, v10, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v16, v13, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, v12, v16, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], 1, v15
-; GISEL-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, v17, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v11, s[4:5], 0, v11
-; GISEL-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[6:7]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v15, v16, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v15, s[4:5], 1, v12
+; GISEL-NEXT:    v_addc_u32_e64 v16, s[4:5], 0, v14, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v10, s[4:5], 0, v10
+; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v3
-; GISEL-NEXT:    v_subbrev_u32_e64 v11, s[4:5], 0, v11, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e64 v10, s[4:5], 0, v10, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v3
-; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 1, v13
-; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, v18, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v2, s[6:7], 1, v8
+; GISEL-NEXT:    v_addc_u32_e64 v3, s[6:7], 0, v17, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e64 v9, s[6:7], 0, v9
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -2160,21 +2134,21 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v10, v0, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v19, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v19, v0, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v18, v1, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v15, v14, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v12, v15, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v13, v2, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v17, v16, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v18, v3, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v8, v2, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v14, v16, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v17, v3, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v9, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_udiv_v2i64_24bit:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index bd4ecd3a17e5..feb3de3ceaad 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -116,14 +116,14 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX8-NEXT:    s_sub_u32 s2, 0, s10
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_sub_u32 s0, 0, s10
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s3, 0, s11
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -131,15 +131,44 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s11
-; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX8-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX8-NEXT:    v_mul_lo_u32 v7, v1, v2
+; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v7, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s11
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
@@ -160,38 +189,7 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX8-NEXT:    v_mul_hi_u32 v8, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s9, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s8, v0
@@ -237,25 +235,26 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
-; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v8
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v7
+; GFX8-NEXT:    v_subb_u32_e32 v2, vcc, v2, v6, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX8-NEXT:    v_add_u32_e64 v12, s[0:1], 1, v9
-; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s10, v7
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
+; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v6, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v8, v2, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v3, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v4, v6, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v8, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v6, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
@@ -269,14 +268,14 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX9-NEXT:    s_sub_u32 s2, 0, s10
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_sub_u32 s0, 0, s10
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s3, 0, s11
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -284,10 +283,10 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -310,37 +309,35 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v6, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s9
-; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s9, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s8, v0
@@ -396,13 +393,14 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v10, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v11, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v8, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v9, v2, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v5, v7, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v11, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v9, v2, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v7, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[4:5]
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
@@ -413,12 +411,12 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s11
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s10
-; GFX10-NEXT:    s_sub_u32 s1, 0, s10
-; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
-; GFX10-NEXT:    s_and_b32 s0, s0, 1
+; GFX10-NEXT:    s_sub_u32 s0, 0, s10
+; GFX10-NEXT:    s_cselect_b32 s1, 1, 0
+; GFX10-NEXT:    s_and_b32 s1, s1, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
-; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s2, 0, s11
+; GFX10-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX10-NEXT:    s_subb_u32 s1, 0, s11
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -427,11 +425,11 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v1
+; GFX10-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s2, v0
-; GFX10-NEXT:    v_mul_hi_u32 v4, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v1, v5
@@ -440,6 +438,33 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX10-NEXT:    v_add_co_u32 v3, s2, v3, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s2
+; GFX10-NEXT:    v_add_co_u32 v6, s2, v7, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s2
+; GFX10-NEXT:    v_add_co_u32 v3, s2, v3, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s2
+; GFX10-NEXT:    v_add_co_u32 v5, s2, v6, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v4, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v7, v6
+; GFX10-NEXT:    v_add_co_u32 v3, s2, v5, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s2
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
+; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
+; GFX10-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v4, s0, v1
+; GFX10-NEXT:    v_mul_hi_u32 v6, v1, v5
+; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX10-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX10-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX10-NEXT:    v_mul_lo_u32 v7, v1, v2
+; GFX10-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v4
 ; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v6, s0, v7, v6
@@ -454,38 +479,9 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v3
 ; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s2, v0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, s0, v1, v2, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v7
-; GFX10-NEXT:    v_add3_u32 v4, v4, v6, v5
-; GFX10-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX10-NEXT:    v_mul_hi_u32 v7, v0, v7
-; GFX10-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX10-NEXT:    v_mul_lo_u32 v9, v3, v4
-; GFX10-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX10-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v5, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v7, s0, v8, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, v6, v5
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v9, v8
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v7, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add3_u32 v2, v4, v6, v3
+; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v0
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v1, v2, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s9, v0
-; GFX10-NEXT:    v_mul_hi_u32 v4, s9, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s8, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s9, v1
@@ -504,49 +500,50 @@ define amdgpu_kernel void @udivrem_i64(i64 addrspace(1)* %out0, i64 addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v0, s0, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v0
+; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v0, 1
 ; GFX10-NEXT:    v_add3_u32 v1, v3, v2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s11, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v3, s10, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s10, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v4, v3
-; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v0, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v2
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, v6, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v7, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v2
 ; GFX10-NEXT:    v_sub_co_u32 v5, vcc_lo, s8, v5
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v7, s0, s9, v2, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v6, vcc_lo
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s0, s9, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v8, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v5, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v7
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v5, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v9
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, s11, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v11
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v13, s0, v3, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v4, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v7
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s0
-; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v9, v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v9, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s0
-; GFX10-NEXT:    global_store_dwordx2 v9, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v9, v[2:3], s[6:7]
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v12, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v10, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v8
+; GFX10-NEXT:    v_mov_b32_e32 v8, 0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v13, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v5, v6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s1
+; GFX10-NEXT:    global_store_dwordx2 v8, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v8, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv i64 %x, %y
   store i64 %div, i64 addrspace(1)* %out0
@@ -1037,14 +1034,14 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX8-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX8-NEXT:    s_sub_u32 s2, 0, s8
-; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX8-NEXT:    s_sub_u32 s0, 0, s8
+; GFX8-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX8-NEXT:    s_and_b32 s0, s0, 1
-; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX8-NEXT:    s_subb_u32 s3, 0, s9
+; GFX8-NEXT:    s_and_b32 s1, s1, 1
+; GFX8-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX8-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX8-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1052,15 +1049,45 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s9
-; GFX8-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX8-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX8-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX8-NEXT:    s_sub_u32 s2, 0, s10
+; GFX8-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX8-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
 ; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX8-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v5, v3
+; GFX8-NEXT:    v_mul_hi_u32 v5, v0, v2
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v6, v4
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v5
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v6, v5
+; GFX8-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v4, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX8-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v6, s9
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
+; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v4
+; GFX8-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX8-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v5
@@ -1081,39 +1108,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v5, v4
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v4
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
-; GFX8-NEXT:    v_addc_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX8-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX8-NEXT:    v_mul_hi_u32 v8, s2, v0
-; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX8-NEXT:    v_add_u32_e64 v1, s[0:1], v1, v2
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v5
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v8
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX8-NEXT:    v_mul_lo_u32 v8, v0, v4
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX8-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX8-NEXT:    s_sub_u32 s2, 0, s10
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v0, v4
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v5, s[0:1], v5, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX8-NEXT:    v_add_u32_e64 v2, s[0:1], v5, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v7, v5
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v4
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
-; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v2, s13, v0
 ; GFX8-NEXT:    v_mul_lo_u32 v3, s12, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v5, s12, v0
@@ -1168,31 +1163,61 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
 ; GFX8-NEXT:    v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1]
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v4, s11
-; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
-; GFX8-NEXT:    v_cvt_f32_u32_e32 v9, s10
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX8-NEXT:    v_add_f32_e32 v4, v4, v9
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v7, v4
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v5, v2, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v3, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v7
-; GFX8-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
-; GFX8-NEXT:    v_trunc_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_f32_e32 v6, 0xcf800000, v3
-; GFX8-NEXT:    v_add_f32_e32 v2, v6, v2
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v13, s11
+; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
+; GFX8-NEXT:    v_cvt_f32_u32_e32 v12, s10
+; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX8-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v13
+; GFX8-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
+; GFX8-NEXT:    v_add_f32_e32 v4, v4, v12
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v9, v4
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
+; GFX8-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v9
+; GFX8-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v3
+; GFX8-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX8-NEXT:    v_mul_f32_e32 v7, 0xcf800000, v6
+; GFX8-NEXT:    v_add_f32_e32 v3, v7, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX8-NEXT:    s_and_b32 s0, s0, 1
 ; GFX8-NEXT:    s_cmp_lg_u32 s0, 0
 ; GFX8-NEXT:    s_subb_u32 s3, 0, s11
+; GFX8-NEXT:    v_mul_lo_u32 v7, s3, v3
+; GFX8-NEXT:    v_mul_lo_u32 v8, s2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v10, s2, v3
+; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v3
+; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
+; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v10
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v9
+; GFX8-NEXT:    v_mul_lo_u32 v10, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v5, v2, vcc
+; GFX8-NEXT:    v_mul_hi_u32 v2, v3, v9
+; GFX8-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v10, v2
+; GFX8-NEXT:    v_mul_hi_u32 v10, v3, v7
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v6, v7, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s3, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s2, v2
@@ -1222,38 +1247,7 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v9, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v7
-; GFX8-NEXT:    v_addc_u32_e64 v7, s[0:1], v3, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v8, s3, v2
-; GFX8-NEXT:    v_mul_lo_u32 v9, s2, v7
-; GFX8-NEXT:    v_mul_hi_u32 v12, s2, v2
-; GFX8-NEXT:    v_mul_lo_u32 v11, s2, v2
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v6
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v9
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v12
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v11
-; GFX8-NEXT:    v_mul_lo_u32 v12, v2, v8
-; GFX8-NEXT:    v_mul_hi_u32 v6, v2, v11
-; GFX8-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v12, v6
-; GFX8-NEXT:    v_mul_hi_u32 v12, v2, v8
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v9, s[0:1], v9, v12
-; GFX8-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v11, s[0:1], v11, v12
-; GFX8-NEXT:    v_mul_hi_u32 v7, v7, v8
-; GFX8-NEXT:    v_add_u32_e64 v6, s[0:1], v9, v6
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v11, v9
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v6
-; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GFX8-NEXT:    v_addc_u32_e32 v3, vcc, v3, v6, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v6, s15, v2
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s14, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v9, s14, v2
@@ -1333,14 +1327,14 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s8
-; GFX9-NEXT:    s_sub_u32 s2, 0, s8
-; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
+; GFX9-NEXT:    s_sub_u32 s0, 0, s8
+; GFX9-NEXT:    s_cselect_b32 s1, 1, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX9-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s0, s0, 1
-; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s3, 0, s9
+; GFX9-NEXT:    s_and_b32 s1, s1, 1
+; GFX9-NEXT:    s_cmp_lg_u32 s1, 0
+; GFX9-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -1349,11 +1343,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v14, s11
-; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX9-NEXT:    v_mul_lo_u32 v3, s3, v0
-; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
+; GFX9-NEXT:    s_sub_u32 s2, 0, s10
+; GFX9-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX9-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v14, 0x4f800000, v14
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_add3_u32 v2, v3, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v2
@@ -1376,41 +1372,40 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, s[0:1], v1, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX9-NEXT:    v_mul_hi_u32 v6, s2, v0
-; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v2
-; GFX9-NEXT:    v_add3_u32 v4, v4, v5, v6
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v7
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v5, v3, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v6, v2
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v3, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v5, v2
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add3_u32 v3, v6, v4, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v1, v2
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v5, v6, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v2, v5, v4, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s13, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s13, v0
-; GFX9-NEXT:    v_mov_b32_e32 v7, s13
+; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
@@ -1431,7 +1426,6 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v0
-; GFX9-NEXT:    v_mov_b32_e32 v5, s9
 ; GFX9-NEXT:    v_add3_u32 v2, v2, v3, v4
 ; GFX9-NEXT:    v_sub_co_u32_e32 v3, vcc, s12, v6
 ; GFX9-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v2, vcc
@@ -1450,8 +1444,8 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], 1, v0
 ; GFX9-NEXT:    v_addc_co_u32_e64 v10, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_add_f32_e32 v5, v14, v5
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v8
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
@@ -1459,106 +1453,104 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v15, vcc, s8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v12, s[0:1]
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v2, vcc, 0, v2, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
-; GFX9-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v5
 ; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], 1, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v11, v11
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX9-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v12, 0xcf800000, v11
+; GFX9-NEXT:    v_mul_f32_e32 v12, 0x2f800000, v5
 ; GFX9-NEXT:    v_addc_co_u32_e64 v13, s[0:1], 0, v10, s[0:1]
-; GFX9-NEXT:    v_add_f32_e32 v5, v12, v5
-; GFX9-NEXT:    s_sub_u32 s8, 0, s10
+; GFX9-NEXT:    v_trunc_f32_e32 v12, v12
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v13, 0xcf800000, v12
+; GFX9-NEXT:    v_add_f32_e32 v5, v13, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GFX9-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX9-NEXT:    s_and_b32 s0, s0, 1
 ; GFX9-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX9-NEXT:    s_subb_u32 s9, 0, s11
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v13, s9, v5
-; GFX9-NEXT:    v_mul_lo_u32 v14, s8, v11
-; GFX9-NEXT:    v_mul_hi_u32 v16, s8, v5
-; GFX9-NEXT:    v_mul_lo_u32 v12, s8, v5
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[0:1]
+; GFX9-NEXT:    s_subb_u32 s3, 0, s11
+; GFX9-NEXT:    v_mul_lo_u32 v13, s3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v14, s2, v12
+; GFX9-NEXT:    v_mul_hi_u32 v16, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v17, s2, v5
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v9, vcc
 ; GFX9-NEXT:    v_add3_u32 v4, v13, v14, v16
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v10, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v9, v11, v12
-; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v4
-; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v12
-; GFX9-NEXT:    v_mul_hi_u32 v12, v11, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v7, v15, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[2:3], v9, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[2:3]
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[2:3], v9, v13
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v13, v11, v4
-; GFX9-NEXT:    v_add_u32_e32 v9, v10, v9
-; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v4
-; GFX9-NEXT:    v_mul_hi_u32 v4, v11, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[2:3], v13, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[2:3]
-; GFX9-NEXT:    v_add_co_u32_e64 v10, s[2:3], v12, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[2:3]
-; GFX9-NEXT:    v_add_co_u32_e64 v9, s[2:3], v10, v9
-; GFX9-NEXT:    v_add_u32_e32 v12, v13, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[2:3]
-; GFX9-NEXT:    v_add3_u32 v10, v12, v10, v4
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[2:3], v5, v9
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[6:7], v11, v10, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v12, s9, v5
-; GFX9-NEXT:    v_mul_lo_u32 v13, s8, v9
-; GFX9-NEXT:    v_mul_hi_u32 v14, s8, v5
-; GFX9-NEXT:    v_mul_lo_u32 v15, s8, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v3, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v12, v13, v14
-; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v15
-; GFX9-NEXT:    v_mul_lo_u32 v8, v5, v3
-; GFX9-NEXT:    v_add_u32_e32 v10, v11, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v5, v15
-; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v15
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v11, v9, v3
-; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
-; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v9, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v11, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v11, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
-; GFX9-NEXT:    v_add_u32_e32 v11, v12, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v11, v8, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v10, v3, s[2:3]
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v5, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v7
-; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v3
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v6, v2, s[0:1]
-; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v7
-; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v7
+; GFX9-NEXT:    v_mul_lo_u32 v9, v12, v17
+; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v10, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v17
+; GFX9-NEXT:    v_mul_hi_u32 v14, v12, v17
+; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v9, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v4
+; GFX9-NEXT:    v_add_u32_e32 v9, v13, v9
+; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v12, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], v10, v14
+; GFX9-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v10, s[0:1], v10, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v9, s[0:1], v10, v9
+; GFX9-NEXT:    v_add_u32_e32 v13, v14, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add3_u32 v4, v13, v10, v4
+; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v9
+; GFX9-NEXT:    v_addc_co_u32_e64 v9, s[0:1], v12, v4, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v12, s2, v9
+; GFX9-NEXT:    v_mul_hi_u32 v13, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v10, s2, v5
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v8, v2, s[0:1]
+; GFX9-NEXT:    v_add3_u32 v8, v4, v12, v13
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v7, v15, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v11, v9, v10
+; GFX9-NEXT:    v_mul_lo_u32 v12, v5, v8
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v7, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v3, v5, v10
+; GFX9-NEXT:    v_mul_hi_u32 v10, v9, v10
+; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v11, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v7, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v8
+; GFX9-NEXT:    v_add_u32_e32 v3, v11, v3
+; GFX9-NEXT:    v_mul_hi_u32 v11, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v8, v9, v8
+; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v7, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v7, s[0:1], v7, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v7, v3
+; GFX9-NEXT:    v_add_u32_e32 v10, v10, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add3_u32 v7, v10, v7, v8
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], v9, v7, s[0:1]
+; GFX9-NEXT:    v_mul_lo_u32 v8, s15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v6, v2, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v2, s14, v3
+; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, s15, v7
 ; GFX9-NEXT:    v_add_u32_e32 v2, v8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v8, s14, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, s15, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
+; GFX9-NEXT:    v_mul_hi_u32 v8, s14, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, s15, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GFX9-NEXT:    v_add3_u32 v3, v7, v6, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_add3_u32 v3, v6, v3, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s11, v2
 ; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s10, v2
@@ -1590,7 +1582,6 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, 1, v14
 ; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, 0, v15, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v14, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v14, v15, v16, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e64 v15, s[0:1], s10, v11
@@ -1610,16 +1601,13 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ;
 ; GFX10-LABEL: udivrem_v2i64:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_clause 0x2
 ; GFX10-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x20
-; GFX10-NEXT:    s_load_dwordx4 s[12:15], s[4:5], 0x0
-; GFX10-NEXT:    s_load_dwordx4 s[16:19], s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s9
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v2, s11
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v3, s10
-; GFX10-NEXT:    s_sub_u32 s2, 0, s8
+; GFX10-NEXT:    s_sub_u32 s6, 0, s8
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f800000, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
@@ -1627,14 +1615,14 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_add_f32_e32 v1, v2, v3
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s1, 0, s9
+; GFX10-NEXT:    s_subb_u32 s7, 0, s9
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX10-NEXT:    s_sub_u32 s3, 0, s10
+; GFX10-NEXT:    s_sub_u32 s12, 0, s10
 ; GFX10-NEXT:    s_cselect_b32 s0, 1, 0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 1
 ; GFX10-NEXT:    s_cmp_lg_u32 s0, 0
-; GFX10-NEXT:    s_subb_u32 s6, 0, s11
+; GFX10-NEXT:    s_subb_u32 s13, 0, s11
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
@@ -1647,16 +1635,16 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX10-NEXT:    v_add_f32_e32 v0, v4, v0
 ; GFX10-NEXT:    v_add_f32_e32 v1, v5, v1
-; GFX10-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v8, s3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v2
+; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_mul_lo_u32 v5, s1, v0
-; GFX10-NEXT:    v_mul_hi_u32 v6, s2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v1
-; GFX10-NEXT:    v_mul_hi_u32 v10, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v5, s7, v0
+; GFX10-NEXT:    v_mul_hi_u32 v6, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v1
+; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v1
 ; GFX10-NEXT:    v_add3_u32 v4, v5, v4, v6
 ; GFX10-NEXT:    v_add3_u32 v8, v9, v8, v10
 ; GFX10-NEXT:    v_mul_lo_u32 v5, v2, v7
@@ -1699,191 +1687,192 @@ define amdgpu_kernel void @udivrem_v2i64(<2 x i64> addrspace(1)* %out0, <2 x i64
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
 ; GFX10-NEXT:    v_add3_u32 v4, v7, v6, v4
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v5
+; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
 ; GFX10-NEXT:    v_add3_u32 v5, v11, v10, v8
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v6, s0, v2, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v9
-; GFX10-NEXT:    v_mul_lo_u32 v8, s1, v0
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v10, s1, v3, v5, s0
-; GFX10-NEXT:    v_mul_hi_u32 v9, s2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s2, v6
-; GFX10-NEXT:    v_mul_lo_u32 v13, s6, v1
-; GFX10-NEXT:    v_mul_hi_u32 v14, s3, v1
-; GFX10-NEXT:    v_mul_lo_u32 v15, s3, v10
-; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX10-NEXT:    v_mul_lo_u32 v12, s3, v1
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v2, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v5
-; GFX10-NEXT:    v_add3_u32 v8, v8, v11, v9
-; GFX10-NEXT:    v_add3_u32 v13, v13, v15, v14
-; GFX10-NEXT:    v_mul_lo_u32 v16, v6, v7
-; GFX10-NEXT:    v_mul_lo_u32 v14, v0, v8
-; GFX10-NEXT:    v_mul_hi_u32 v17, v0, v7
-; GFX10-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX10-NEXT:    v_mul_lo_u32 v15, v6, v8
-; GFX10-NEXT:    v_mul_lo_u32 v9, v10, v12
-; GFX10-NEXT:    v_mul_hi_u32 v18, v0, v8
-; GFX10-NEXT:    v_mul_hi_u32 v6, v6, v8
-; GFX10-NEXT:    v_mul_lo_u32 v8, v1, v13
-; GFX10-NEXT:    v_add_co_u32 v14, s1, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v15, v7
-; GFX10-NEXT:    v_mul_hi_u32 v11, v1, v12
-; GFX10-NEXT:    v_mul_hi_u32 v12, v10, v12
-; GFX10-NEXT:    v_mul_lo_u32 v19, v10, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v9, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v14, s1, v14, v17
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v12, s1, v19, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v18
-; GFX10-NEXT:    v_add_nc_u32_e32 v14, v16, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v8, s1, v8, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_add_co_u32 v7, s1, v7, v14
-; GFX10-NEXT:    v_mul_hi_u32 v20, v1, v13
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v15, v18
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
-; GFX10-NEXT:    v_add3_u32 v4, v11, v14, v6
-; GFX10-NEXT:    v_add_co_u32 v12, s1, v12, v20
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s1
-; GFX10-NEXT:    v_mul_hi_u32 v6, v10, v13
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v4, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v4, s1, v12, v8
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v7
-; GFX10-NEXT:    v_add_nc_u32_e32 v9, v17, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v9
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v6, s7, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v5, s6, v2
+; GFX10-NEXT:    v_mul_lo_u32 v9, s13, v1
+; GFX10-NEXT:    v_mul_hi_u32 v10, s12, v1
+; GFX10-NEXT:    v_mul_lo_u32 v11, s12, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, s6, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s12, v1
+; GFX10-NEXT:    v_add3_u32 v5, v6, v5, v7
+; GFX10-NEXT:    v_add3_u32 v9, v9, v11, v10
+; GFX10-NEXT:    v_mul_lo_u32 v12, v2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v10, v0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v13, v0, v4
+; GFX10-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX10-NEXT:    v_mul_lo_u32 v11, v2, v5
+; GFX10-NEXT:    v_mul_lo_u32 v6, v3, v8
+; GFX10-NEXT:    v_mul_lo_u32 v15, v1, v9
+; GFX10-NEXT:    v_mul_hi_u32 v7, v1, v8
+; GFX10-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GFX10-NEXT:    v_add_co_u32 v10, s6, v12, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v4, s6, v11, v4
+; GFX10-NEXT:    v_mul_lo_u32 v16, v3, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v15
+; GFX10-NEXT:    v_mul_hi_u32 v14, v0, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v10, s6, v10, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v8, s6, v16, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v4, s6, v4, v14
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v12, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v6, s6, v6, v7
+; GFX10-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v4, s6, v4, v10
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v11, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s6
+; GFX10-NEXT:    v_mul_hi_u32 v17, v1, v9
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v15, v6
+; GFX10-NEXT:    v_add3_u32 v5, v7, v10, v5
+; GFX10-NEXT:    v_mul_hi_u32 v9, v3, v9
 ; GFX10-NEXT:    v_mov_b32_e32 v10, 0
-; GFX10-NEXT:    v_add3_u32 v5, v9, v8, v6
-; GFX10-NEXT:    v_mul_lo_u32 v6, s17, v0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s16, v2
-; GFX10-NEXT:    v_mul_hi_u32 v8, s17, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s16, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s17, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v3, vcc_lo, v3, v5, s0
-; GFX10-NEXT:    v_mul_hi_u32 v5, s16, v2
-; GFX10-NEXT:    v_mul_hi_u32 v2, s17, v2
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v8, s0, v9, v8
-; GFX10-NEXT:    v_add_co_u32 v0, s1, v6, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v5, s0, v8, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v7, v0
+; GFX10-NEXT:    v_add_co_u32 v8, s6, v8, v17
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s6
+; GFX10-NEXT:    v_add_co_u32 v4, s12, v8, v6
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s0, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v13, v11
+; GFX10-NEXT:    v_mul_hi_u32 v11, s1, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT:    v_mul_lo_u32 v12, s1, v2
 ; GFX10-NEXT:    v_add_co_u32 v1, vcc_lo, v1, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v0, s0, v5, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s19, v1
-; GFX10-NEXT:    v_mul_hi_u32 v7, s18, v1
-; GFX10-NEXT:    v_mul_hi_u32 v1, s19, v1
-; GFX10-NEXT:    v_mul_lo_u32 v8, s9, v0
+; GFX10-NEXT:    v_add_co_u32 v6, s12, v6, v8
+; GFX10-NEXT:    v_add3_u32 v5, v7, v5, v9
+; GFX10-NEXT:    v_mul_hi_u32 v7, s0, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s12
+; GFX10-NEXT:    v_add_co_u32 v0, s13, v6, v0
+; GFX10-NEXT:    v_add_co_u32 v9, s12, v12, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s13
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s12
+; GFX10-NEXT:    v_mul_hi_u32 v2, s1, v2
+; GFX10-NEXT:    v_add_co_u32 v7, s12, v9, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v8, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s12
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v5, vcc_lo
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_add_co_u32 v0, s12, v7, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s12
+; GFX10-NEXT:    v_mul_lo_u32 v6, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v3
+; GFX10-NEXT:    v_mul_lo_u32 v9, s9, v0
+; GFX10-NEXT:    v_mul_hi_u32 v11, s8, v0
 ; GFX10-NEXT:    v_add3_u32 v2, v4, v5, v2
-; GFX10-NEXT:    v_mul_lo_u32 v4, s18, v3
-; GFX10-NEXT:    v_mul_lo_u32 v5, s19, v3
-; GFX10-NEXT:    v_mul_hi_u32 v9, s8, v0
+; GFX10-NEXT:    v_mul_hi_u32 v7, s2, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v13, s8, v0
-; GFX10-NEXT:    v_mul_lo_u32 v11, s8, v2
-; GFX10-NEXT:    v_mul_hi_u32 v12, s18, v3
-; GFX10-NEXT:    v_mul_hi_u32 v3, s19, v3
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v6, v4
-; GFX10-NEXT:    v_add_co_u32 v1, s1, v5, v1
-; GFX10-NEXT:    v_add3_u32 v5, v8, v11, v9
+; GFX10-NEXT:    v_mul_hi_u32 v1, s3, v1
+; GFX10-NEXT:    v_mul_lo_u32 v4, s3, v3
+; GFX10-NEXT:    v_mul_lo_u32 v12, s8, v2
+; GFX10-NEXT:    v_add_co_u32 v6, s12, v6, v8
+; GFX10-NEXT:    v_mul_hi_u32 v5, s2, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s12
+; GFX10-NEXT:    v_mul_hi_u32 v3, s3, v3
+; GFX10-NEXT:    v_add_co_u32 v1, s12, v4, v1
+; GFX10-NEXT:    v_add3_u32 v9, v9, v12, v11
+; GFX10-NEXT:    v_sub_co_u32 v11, vcc_lo, s0, v13
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v6, v7
+; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s1, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, s16, v13
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v7, s17, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v5, s0, s17, v5, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s0, s1, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v11
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, v6, v4
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v8, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s0, 0, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v17, s0, v0, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v2, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v11, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v16, v12
-; GFX10-NEXT:    v_add_co_u32 v12, s0, v1, v4
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s9, v9
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v8, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s12
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v11, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v7, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v1, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, v4, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, v13, v12, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v17, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v18, s0
-; GFX10-NEXT:    v_add3_u32 v3, v11, v1, v3
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v1, vcc_lo, s9, v7, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
-; GFX10-NEXT:    v_mul_lo_u32 v6, s11, v12
-; GFX10-NEXT:    v_mul_lo_u32 v7, s10, v3
-; GFX10-NEXT:    v_mul_hi_u32 v11, s10, v12
-; GFX10-NEXT:    v_sub_co_u32 v19, s0, v13, s8
-; GFX10-NEXT:    v_mul_lo_u32 v16, s10, v12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v20, s0, 0, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v17, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v18, v15, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v6, v6, v7, v11
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v14, v14, v20, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v7, s1, s18, v16
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s2, s19, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v2, v4, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s19, v6
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v9
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v13, v19, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v14, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, s11, v2, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s1
-; GFX10-NEXT:    v_sub_co_u32 v13, s1, v7, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s11, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v16, vcc_lo, v12, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v17, vcc_lo, 0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v11, vcc_lo, v16, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, 0, v17, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, vcc_lo, s11, v2, s1
+; GFX10-NEXT:    v_add_co_u32 v5, s0, v0, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v16, s0, 0, v2, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v15
+; GFX10-NEXT:    v_add3_u32 v3, v4, v1, v3
+; GFX10-NEXT:    v_mul_hi_u32 v18, s10, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
+; GFX10-NEXT:    v_mul_lo_u32 v13, s11, v6
+; GFX10-NEXT:    v_mul_lo_u32 v17, s10, v3
+; GFX10-NEXT:    v_add_co_u32 v1, s0, v5, 1
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v8
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v4, s0, 0, v16, s0
+; GFX10-NEXT:    v_sub_co_u32 v19, s0, v14, s8
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v6
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v16, v4, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v13, v13, v17, v18
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v13
+; GFX10-NEXT:    v_sub_co_u32 v12, s0, s2, v5
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v16, s1, s3, v13, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v12
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v8
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s11, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
+; GFX10-NEXT:    v_sub_co_u32 v13, s0, v12, s10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v14, v19, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v14, s2, 0, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v15, v7, s1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s11, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v11, v4, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s11, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s11, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s10, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s1
+; GFX10-NEXT:    v_add_co_u32 v15, s1, v6, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s11, v14
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s1
+; GFX10-NEXT:    v_add_co_u32 v11, s1, v15, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
 ; GFX10-NEXT:    v_sub_co_u32 v8, s1, v13, s10
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, v16, v11, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v6
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v13, v8, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v8, v15, v2, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v12, v11, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v16, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v8, s1
-; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[12:13]
-; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[14:15]
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v15, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v15, v17, v18, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, v13, v8, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, v14, v2, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v11, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v12, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v16, v13, s1
+; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
+; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv <2 x i64> %x, %y
   store <2 x i64> %div, <2 x i64> addrspace(1)* %out0

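The hunks above all make the same substitution: the removed lines compute the high-half sum both with and without the low half's carry (a v_addc_u32_e64 beside a plain v_add_i32_e64) and patch the missing carry back in later (the trailing "v_addc_u32 ..., 0, ..." that also disappears), while the added lines keep a single carry-propagating v_addc_u32_e32 through vcc. As an illustrative aside rather than part of the patch, here is a minimal C sketch of the two shapes of two-limb addition; the function names are mine, not LLVM's:

#include <assert.h>
#include <stdint.h>

/* New shape: the high halves are added *with* the carry-out of the
 * low halves, matching the single v_addc_u32 in the + lines above. */
static uint64_t add64_with_carry(uint32_t alo, uint32_t ahi,
                                 uint32_t blo, uint32_t bhi) {
  uint32_t lo = alo + blo;
  uint32_t carry = lo < alo;       /* carry-out of the low add */
  uint32_t hi = ahi + bhi + carry; /* addc: carry-in consumed here */
  return ((uint64_t)hi << 32) | lo;
}

/* Old shape: add the highs without carry-in, then reconcile the
 * carry in a separate step, which is what the extra v_add/v_addc
 * pairs in the - lines spend instructions on. */
static uint64_t add64_carry_later(uint32_t alo, uint32_t ahi,
                                  uint32_t blo, uint32_t bhi) {
  uint32_t lo = alo + blo;
  uint32_t hi_nc = ahi + bhi;  /* no carry-in yet */
  uint32_t carry = lo < alo;
  uint32_t hi = hi_nc + carry; /* carry patched in afterwards */
  return ((uint64_t)hi << 32) | lo;
}

int main(void) {
  /* Both forms compute the same two-limb sum; the second simply
   * takes more steps, which the regenerated checks reflect. */
  assert(add64_with_carry(0xffffffffu, 1, 1, 2) ==
         add64_carry_later(0xffffffffu, 1, 1, 2));
  return 0;
}
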
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 63b4a5eb0d6e..8a04daed6b3b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -58,38 +58,36 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v6, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v10, v6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v9
-; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
+; CHECK-NEXT:    v_mul_hi_u32 v9, v6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v6, v6, v1
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v6
-; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v9, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v1, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v5, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v5, v0
@@ -192,24 +190,24 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v0, s2
 ; CHECK-NEXT:    v_mov_b32_e32 v1, s3
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, s3
-; CHECK-NEXT:    s_sub_u32 s6, 0, s2
-; CHECK-NEXT:    s_cselect_b32 s4, 1, 0
+; CHECK-NEXT:    s_sub_u32 s4, 0, s2
+; CHECK-NEXT:    s_cselect_b32 s5, 1, 0
 ; CHECK-NEXT:    v_mov_b32_e32 v3, s1
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v2
-; CHECK-NEXT:    s_and_b32 s4, s4, 1
+; CHECK-NEXT:    s_and_b32 s5, s5, 1
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; CHECK-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT:    s_cmp_lg_u32 s4, 0
-; CHECK-NEXT:    s_subb_u32 s7, 0, s3
+; CHECK-NEXT:    s_cmp_lg_u32 s5, 0
+; CHECK-NEXT:    s_subb_u32 s5, 0, s3
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; CHECK-NEXT:    v_trunc_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v5, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v0
+; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v6, s5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v7, s4, v0
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v2, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -234,38 +232,36 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT:    v_addc_u32_e64 v5, s[4:5], v2, v4, vcc
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v4
-; CHECK-NEXT:    v_mul_lo_u32 v4, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v6, s7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v7, s6, v0
-; CHECK-NEXT:    v_mul_lo_u32 v8, s6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v4
-; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v4
-; CHECK-NEXT:    v_mul_hi_u32 v4, v5, v4
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v6
-; CHECK-NEXT:    v_mul_hi_u32 v5, v5, v6
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v8, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v11
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v9
-; CHECK-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v4, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v5, s5, v0
+; CHECK-NEXT:    v_mul_hi_u32 v6, s4, v0
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v4
+; CHECK-NEXT:    v_mul_hi_u32 v9, v0, v4
+; CHECK-NEXT:    v_mul_hi_u32 v4, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; CHECK-NEXT:    v_mul_lo_u32 v6, v0, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v10, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v5, v2, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v5, s0, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, s1, v0
@@ -403,38 +399,36 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v8
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v8, v10
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v8
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v8
@@ -529,38 +523,36 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v5, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v5, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v4
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v11
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v4, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v5, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v5, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v5, vcc, 0, v5, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
@@ -672,38 +664,36 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
-; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v2, v0
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v3, v2
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v14
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v0
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
 ; CGP-NEXT:    v_mul_lo_u32 v3, v0, v2
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v2
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v2
-; CGP-NEXT:    v_mul_hi_u32 v2, v13, v2
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v15, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v13, v3
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v12, v3
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v2
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v2
+; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v14, v3
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v2, v11, v0
 ; CGP-NEXT:    v_mul_hi_u32 v3, v10, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v11, v0
@@ -833,38 +823,36 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT:    v_addc_u32_e64 v11, s[4:5], v3, v10, vcc
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v10, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v2
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v11
-; CGP-NEXT:    v_mul_lo_u32 v13, v11, v10
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v10
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v5, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v11, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
 ; CGP-NEXT:    v_mul_lo_u32 v5, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v11, v4
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v11, v4
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v12, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v10
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v12, v5
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v2
 ; CGP-NEXT:    v_mul_hi_u32 v5, v8, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v9, v2
@@ -982,13 +970,13 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
-; CHECK-NEXT:    s_mov_b32 s7, 0x12d8fb
-; CHECK-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CHECK-NEXT:    s_mov_b32 s4, 0xffed2705
+; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
 ; CHECK-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; CHECK-NEXT:    s_bfe_i32 s7, -1, 0x10000
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT:    v_mov_b32_e32 v3, s4
-; CHECK-NEXT:    v_mov_b32_e32 v4, s5
+; CHECK-NEXT:    v_mov_b32_e32 v3, s5
+; CHECK-NEXT:    v_mov_b32_e32 v4, s7
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v2
@@ -996,10 +984,10 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, s4, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v8, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v9, s6, v2
+; CHECK-NEXT:    v_mul_hi_u32 v9, s4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v7
@@ -1024,38 +1012,36 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT:    v_addc_u32_e64 v7, s[4:5], v5, v6, vcc
-; CHECK-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v6, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, -1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v9, s6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, s6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v11, v7, v6
-; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v7, v6
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v10
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v9
-; CHECK-NEXT:    v_mul_lo_u32 v9, v2, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v8
-; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v8
-; CHECK-NEXT:    v_mul_hi_u32 v7, v7, v8
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v11, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v10, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v8
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, -1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, s4, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, s4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v5, v6
+; CHECK-NEXT:    v_mul_hi_u32 v11, v2, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v5, v6
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v7
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v7
+; CHECK-NEXT:    v_mul_hi_u32 v12, v2, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v0, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
@@ -1076,28 +1062,28 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, s7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, s6, v2
 ; CHECK-NEXT:    v_mul_lo_u32 v8, 0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, s7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, s6, v2
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v5, s7, v5
+; CHECK-NEXT:    v_mul_lo_u32 v5, s6, v5
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CHECK-NEXT:    v_subb_u32_e64 v5, s[4:5], v1, v2, vcc
 ; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v0
+; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s7, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s7, v3
+; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
 ; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, s7, v3
+; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, s6, v3
 ; CHECK-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
@@ -1114,9 +1100,9 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_urem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s12, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s12
-; GISEL-NEXT:    s_sub_u32 s8, 0, s12
+; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
+; GISEL-NEXT:    s_sub_u32 s6, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v6, v4
@@ -1126,250 +1112,246 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v6
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s9, 0, 0
-; GISEL-NEXT:    s_bfe_i32 s10, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; GISEL-NEXT:    s_sub_u32 s13, 0, s12
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v4
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v5
+; GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; GISEL-NEXT:    v_mov_b32_e32 v4, s5
+; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
+; GISEL-NEXT:    s_sub_u32 s9, 0, s8
 ; GISEL-NEXT:    s_cselect_b32 s4, 1, 0
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
+; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
 ; GISEL-NEXT:    s_and_b32 s4, s4, 1
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v9
+; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v9
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; GISEL-NEXT:    s_cmp_lg_u32 s4, 0
-; GISEL-NEXT:    s_subb_u32 s6, 0, 0
-; GISEL-NEXT:    v_mul_lo_u32 v8, s13, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, s13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v12, s13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v14, s9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v15, s8, v5
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v16, v4, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v6, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v13
-; GISEL-NEXT:    v_mul_hi_u32 v17, v5, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v13
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, v4, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GISEL-NEXT:    v_mul_lo_u32 v19, v5, v9
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v17, v5, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v15, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; GISEL-NEXT:    s_subb_u32 s10, 0, 0
+; GISEL-NEXT:    v_mul_lo_u32 v10, s9, v8
+; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v9
+; GISEL-NEXT:    v_mul_lo_u32 v12, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v13, s10, v6
+; GISEL-NEXT:    v_mul_hi_u32 v14, s9, v6
+; GISEL-NEXT:    v_mov_b32_e32 v15, s4
+; GISEL-NEXT:    v_mul_lo_u32 v16, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v17, s7, v7
+; GISEL-NEXT:    v_mul_hi_u32 v18, s6, v7
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
+; GISEL-NEXT:    v_mul_lo_u32 v13, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v19, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v16
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v16
+; GISEL-NEXT:    v_mul_hi_u32 v16, v9, v16
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
+; GISEL-NEXT:    v_mul_lo_u32 v17, v8, v10
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v19
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v11
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v17, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT:    v_mul_hi_u32 v18, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v17
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, s9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v12, s10, v6
+; GISEL-NEXT:    v_mul_hi_u32 v13, s9, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v7
+; GISEL-NEXT:    v_mul_hi_u32 v16, s6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v17, s9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v10
+; GISEL-NEXT:    v_mul_hi_u32 v19, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, s6, v9
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v15
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, s13, v4
-; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, s13, v4
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
-; GISEL-NEXT:    v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v15, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v16, s9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v17, s8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v18, s8, v13
-; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v15
-; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v18
-; GISEL-NEXT:    v_mul_hi_u32 v18, v5, v15
-; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v17
-; GISEL-NEXT:    v_mul_lo_u32 v17, v5, v16
-; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], v19, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v17, s[6:7], v17, v18
-; GISEL-NEXT:    v_mul_lo_u32 v17, s13, v10
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[8:9], v12, v17
-; GISEL-NEXT:    v_mul_hi_u32 v17, v4, v11
-; GISEL-NEXT:    v_add_i32_e64 v12, s[8:9], v12, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v4, v12
-; GISEL-NEXT:    v_add_i32_e64 v14, s[8:9], v18, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[8:9], v14, v17
-; GISEL-NEXT:    v_mov_b32_e32 v14, s10
-; GISEL-NEXT:    v_mov_b32_e32 v17, s11
-; GISEL-NEXT:    s_bfe_i32 s13, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s14, -1, 0x10000
-; GISEL-NEXT:    v_add_i32_e64 v6, s[10:11], v6, v8
-; GISEL-NEXT:    v_mov_b32_e32 v8, s13
-; GISEL-NEXT:    v_add_i32_e64 v7, s[10:11], v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[8:9], v18, v15
-; GISEL-NEXT:    v_mul_lo_u32 v18, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v4, v12
-; GISEL-NEXT:    v_add_i32_e64 v9, s[8:9], v18, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[8:9], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[8:9], v18, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v13, v13, v16
-; GISEL-NEXT:    v_mul_hi_u32 v16, v5, v16
-; GISEL-NEXT:    v_add_i32_e64 v11, s[6:7], v19, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v16
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v16, s[6:7], v19, v16
-; GISEL-NEXT:    v_mov_b32_e32 v19, s14
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v12, v15
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v18
-; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v10, v12
-; GISEL-NEXT:    v_add_i32_e64 v12, s[6:7], v13, v15
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, v7, v12, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, v2, v6
-; GISEL-NEXT:    v_mul_lo_u32 v15, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v16, v2, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v18, v0, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v7, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v14
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v12
+; GISEL-NEXT:    v_mul_lo_u32 v16, v8, v12
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v18, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v19
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
+; GISEL-NEXT:    v_mul_lo_u32 v19, v9, v14
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v16, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v16, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; GISEL-NEXT:    v_mov_b32_e32 v19, s11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v9, v14
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v16
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v15, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v14, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v16, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v17, v2, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
+; GISEL-NEXT:    v_mul_lo_u32 v18, v0, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v16, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v16
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v16
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, s12, v4
-; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, s12, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_lo_u32 v12, s12, v5
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, s12, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v6, s12, v6
-; GISEL-NEXT:    v_mul_lo_u32 v7, s12, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v15, v7
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s12, v2
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v5
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s12, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v18, v17
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v14, 0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, s8, v6
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_mul_lo_u32 v8, s8, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v14, v8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v9
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v12
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v3, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v6
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v13
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], v1, v7, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v7
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v15, v6, s[6:7]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GISEL-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, s12, v2
+; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v2
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s12, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v10, vcc, s12, v0
+; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, s8, v0
 ; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s12, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v12, vcc, s12, v8
-; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v19, v10, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v13, vcc, s8, v7
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v14, vcc, s12, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v12, vcc, s8, v11
 ; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, v10, v14, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v11, v12, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v8, v3, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_oddk_denom:
@@ -1377,250 +1359,246 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s8, 0xffed2705
-; CGP-NEXT:    s_mov_b32 s12, 0x12d8fb
-; CGP-NEXT:    s_bfe_i32 s10, -1, 0x10000
-; CGP-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; CGP-NEXT:    s_bfe_i32 s13, -1, 0x10000
-; CGP-NEXT:    s_bfe_i32 s14, -1, 0x10000
+; CGP-NEXT:    s_mov_b32 s6, 0xffed2705
+; CGP-NEXT:    s_mov_b32 s8, 0x12d8fb
+; CGP-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; CGP-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; CGP-NEXT:    s_bfe_i32 s7, -1, 0x10000
+; CGP-NEXT:    s_bfe_i32 s9, -1, 0x10000
 ; CGP-NEXT:    v_mov_b32_e32 v6, v4
+; CGP-NEXT:    v_mov_b32_e32 v7, s4
+; CGP-NEXT:    v_mov_b32_e32 v8, s5
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; CGP-NEXT:    v_mov_b32_e32 v9, s7
 ; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v6
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
 ; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v4
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
+; CGP-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v5
 ; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
+; CGP-NEXT:    v_trunc_f32_e32 v10, v10
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v10
+; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v6
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, s8, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v11, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v12, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s8, v5
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT:    v_mul_lo_u32 v11, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v16, v4, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v6, v10
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v13
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v7, v13
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
-; CGP-NEXT:    v_mul_lo_u32 v12, v4, v8
-; CGP-NEXT:    v_mul_lo_u32 v15, v6, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v4, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CGP-NEXT:    v_mul_lo_u32 v19, v5, v9
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v17, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v15, v10
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
+; CGP-NEXT:    v_mul_lo_u32 v13, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v14, -1, v4
+; CGP-NEXT:    v_mul_hi_u32 v15, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v16, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v17, -1, v5
+; CGP-NEXT:    v_mul_hi_u32 v18, s6, v5
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
+; CGP-NEXT:    v_mul_lo_u32 v14, v6, v13
+; CGP-NEXT:    v_mul_hi_u32 v19, v4, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v6, v13
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
+; CGP-NEXT:    v_mul_lo_u32 v17, v10, v16
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v5, v16
+; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; CGP-NEXT:    v_mul_lo_u32 v18, v5, v12
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
+; CGP-NEXT:    v_mul_lo_u32 v15, v4, v11
+; CGP-NEXT:    v_mul_lo_u32 v17, v6, v11
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
+; CGP-NEXT:    v_mul_lo_u32 v19, v10, v12
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v16
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; CGP-NEXT:    v_mul_hi_u32 v18, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; CGP-NEXT:    v_addc_u32_e64 v10, s[4:5], v6, v8, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, s8, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, -1, v4
-; CGP-NEXT:    v_mul_hi_u32 v14, s8, v4
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[6:7], v7, v9, s[4:5]
-; CGP-NEXT:    v_mul_lo_u32 v15, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v17, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v18, s8, v13
-; CGP-NEXT:    v_mul_lo_u32 v19, v13, v15
-; CGP-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v18
-; CGP-NEXT:    v_mul_hi_u32 v18, v5, v15
-; CGP-NEXT:    v_add_i32_e64 v16, s[6:7], v16, v17
-; CGP-NEXT:    v_mul_lo_u32 v17, v5, v16
-; CGP-NEXT:    v_add_i32_e64 v17, s[6:7], v19, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v17, s[6:7], v17, v18
-; CGP-NEXT:    v_mul_lo_u32 v17, s8, v10
-; CGP-NEXT:    v_mul_lo_u32 v18, v10, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[8:9], v12, v17
-; CGP-NEXT:    v_mul_hi_u32 v17, v4, v11
-; CGP-NEXT:    v_add_i32_e64 v12, s[8:9], v12, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v4, v12
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v18, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v14, s[8:9], v14, v17
-; CGP-NEXT:    v_mov_b32_e32 v14, s10
-; CGP-NEXT:    v_mov_b32_e32 v17, s11
-; CGP-NEXT:    v_add_i32_e64 v6, s[10:11], v6, v8
-; CGP-NEXT:    v_mov_b32_e32 v8, s13
-; CGP-NEXT:    v_add_i32_e64 v7, s[10:11], v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v10, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v15, s[8:9], v18, v15
-; CGP-NEXT:    v_mul_lo_u32 v18, v10, v12
-; CGP-NEXT:    v_mul_hi_u32 v10, v10, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v12
-; CGP-NEXT:    v_add_i32_e64 v9, s[8:9], v18, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v9, s[8:9], v9, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[8:9]
-; CGP-NEXT:    v_add_i32_e64 v12, s[8:9], v18, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; CGP-NEXT:    v_mul_lo_u32 v19, v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v13, v13, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v5, v16
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v19, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v16, s[6:7], v19, v16
-; CGP-NEXT:    v_mov_b32_e32 v19, s14
-; CGP-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v11, s[6:7], v11, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v12, v15
-; CGP-NEXT:    v_add_i32_e64 v15, s[6:7], v16, v18
-; CGP-NEXT:    v_add_i32_e64 v10, s[6:7], v10, v12
-; CGP-NEXT:    v_add_i32_e64 v12, s[6:7], v13, v15
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
-; CGP-NEXT:    v_addc_u32_e64 v7, vcc, v7, v12, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v3, v4
-; CGP-NEXT:    v_mul_hi_u32 v10, v2, v4
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, s6, v4
+; CGP-NEXT:    v_mul_lo_u32 v13, -1, v4
+; CGP-NEXT:    v_mul_hi_u32 v14, s6, v4
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v16
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v15, -1, v5
+; CGP-NEXT:    v_mul_hi_u32 v16, s6, v5
+; CGP-NEXT:    v_mul_lo_u32 v17, s6, v6
+; CGP-NEXT:    v_mul_lo_u32 v18, v6, v11
+; CGP-NEXT:    v_mul_hi_u32 v19, v4, v11
+; CGP-NEXT:    v_mul_hi_u32 v11, v6, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
+; CGP-NEXT:    v_mul_lo_u32 v17, s6, v10
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_mul_lo_u32 v17, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT:    v_mul_lo_u32 v16, v5, v15
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_mul_lo_u32 v14, v4, v13
+; CGP-NEXT:    v_mul_lo_u32 v16, v6, v13
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; CGP-NEXT:    v_mul_hi_u32 v14, v4, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
+; CGP-NEXT:    v_mul_lo_u32 v19, v10, v15
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v16, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_mul_hi_u32 v17, v5, v15
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; CGP-NEXT:    v_mov_b32_e32 v19, s9
+; CGP-NEXT:    v_mul_hi_u32 v13, v6, v13
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v13, vcc
+; CGP-NEXT:    v_mul_lo_u32 v11, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v7, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v0, v5
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v14, vcc
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v5
+; CGP-NEXT:    v_mul_hi_u32 v14, v0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, v2, v6
-; CGP-NEXT:    v_mul_lo_u32 v15, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v16, v2, v6
+; CGP-NEXT:    v_mul_lo_u32 v15, v2, v6
+; CGP-NEXT:    v_mul_lo_u32 v16, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v17, v2, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v18, v0, v7
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; CGP-NEXT:    v_mul_lo_u32 v18, v0, v10
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v11, v1, v7
-; CGP-NEXT:    v_mul_hi_u32 v12, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v15, v4
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_mul_lo_u32 v12, v1, v10
+; CGP-NEXT:    v_mul_hi_u32 v14, v0, v10
+; CGP-NEXT:    v_mul_hi_u32 v10, v1, v10
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v11, v5
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v16, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v18, v16
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT:    v_mul_lo_u32 v10, s12, v4
-; CGP-NEXT:    v_mul_lo_u32 v13, 0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, s12, v4
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, s12, v5
-; CGP-NEXT:    v_mul_lo_u32 v15, 0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, s12, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; CGP-NEXT:    v_mul_lo_u32 v6, s12, v6
-; CGP-NEXT:    v_mul_lo_u32 v7, s12, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v15, v7
+; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_mul_lo_u32 v13, s8, v4
+; CGP-NEXT:    v_mul_lo_u32 v15, 0, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, s8, v4
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_mul_lo_u32 v14, s8, v5
+; CGP-NEXT:    v_mul_lo_u32 v16, 0, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, s8, v5
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_mul_lo_u32 v6, s8, v6
+; CGP-NEXT:    v_mul_lo_u32 v10, s8, v10
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v13
 ; CGP-NEXT:    v_subb_u32_e64 v6, s[4:5], v3, v4, vcc
 ; CGP-NEXT:    v_sub_i32_e64 v3, s[4:5], v3, v4
-; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s12, v2
+; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v12
-; CGP-NEXT:    v_subb_u32_e64 v7, s[6:7], v1, v5, s[4:5]
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v14
+; CGP-NEXT:    v_subb_u32_e64 v10, s[6:7], v1, v5, s[4:5]
 ; CGP-NEXT:    v_sub_i32_e64 v1, s[6:7], v1, v5
-; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], s12, v0
+; CGP-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[6:7]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v8, v4, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e64 v4, v9, v4, s[6:7]
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; CGP-NEXT:    v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
-; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, s12, v2
+; CGP-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v2
 ; CGP-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s12, v8
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v10, vcc, s12, v0
+; CGP-NEXT:    v_subrev_i32_e32 v11, vcc, s8, v0
 ; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s12, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; CGP-NEXT:    v_cndmask_b32_e32 v9, v19, v9, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v12, vcc, s12, v8
-; CGP-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v3, vcc
+; CGP-NEXT:    v_subrev_i32_e32 v13, vcc, s8, v7
+; CGP-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; CGP-NEXT:    v_cndmask_b32_e32 v11, v17, v11, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v14, vcc, s12, v10
+; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
+; CGP-NEXT:    v_subrev_i32_e32 v12, vcc, s8, v11
 ; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v8, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v9, v10, v14, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v13, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v7, v7, v13, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v8, v11, v12, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v15, s[4:5]
 ; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e64 v1, v7, v1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s[4:5]
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v6, v3, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
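
For readers skimming the checks above: the test takes a remainder by the odd constant 1235195, which is the 0x12d8fb moved into an SGPR in both the old and new code, and the expansion also materializes 0xffed2705, its two's-complement negation. A minimal C sketch confirming that relationship between the two magic constants (variable name `d` is illustrative, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      uint32_t d = 1235195;                    /* the odd denominator */
      assert(d == 0x12d8fbu);                  /* constant seen in the checks */
      assert((uint32_t)0u - d == 0xffed2705u); /* negation mod 2^32, the
                                                  other constant in the checks */
      return 0;
    }
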
@@ -1683,38 +1661,36 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
 ; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT:    v_addc_u32_e64 v9, s[4:5], v1, v8, vcc
-; CHECK-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v8
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v8, v2, v0
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT:    v_mul_hi_u32 v10, v2, v0
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v9
-; CHECK-NEXT:    v_mul_lo_u32 v11, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v8
-; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v7, v2
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v10
+; CHECK-NEXT:    v_mul_hi_u32 v9, v2, v0
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v10, v1, v8
+; CHECK-NEXT:    v_mul_hi_u32 v11, v0, v8
+; CHECK-NEXT:    v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v10, v9, v2
-; CHECK-NEXT:    v_mul_hi_u32 v13, v0, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v9, v2
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v10, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v12
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v9, v7
-; CHECK-NEXT:    v_add_i32_e64 v9, s[4:5], v10, v11
-; CHECK-NEXT:    v_add_i32_e64 v7, s[4:5], v8, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CHECK-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CHECK-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v8
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v12, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v2, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v3, v0
 ; CHECK-NEXT:    v_mul_hi_u32 v0, v4, v0
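
The CHECK hunk above shows the shape of the change at its smallest: the removed lines form the high word twice (a v_addc_u32 into a temporary plus a plain v_add_i32 with no carry-in) and fold the low-half carry back with a trailing v_addc_u32 of 0, while the added lines use a single v_addc_u32 chained off vcc. A minimal C sketch of why the two shapes agree, assuming 32-bit halves (function names and sample values are illustrative only):

    #include <assert.h>
    #include <stdint.h>

    /* Old shape: high halves summed without carry-in; the carry-out of
       the low addition is added back in a separate later step. */
    static uint32_t add_hi_late_carry(uint32_t alo, uint32_t ahi,
                                      uint32_t blo, uint32_t bhi) {
      uint32_t lo = alo + blo;
      uint32_t carry = lo < alo; /* carry-out of the low addition */
      uint32_t hi = ahi + bhi;   /* high addition without carry-in */
      return hi + carry;         /* carry folded back in afterwards */
    }

    /* New shape: one carry-chained high addition (v_addc_u32 off vcc). */
    static uint32_t add_hi_carry_chain(uint32_t alo, uint32_t ahi,
                                       uint32_t blo, uint32_t bhi) {
      uint32_t lo = alo + blo;
      uint32_t carry = lo < alo;
      return ahi + bhi + carry;
    }

    int main(void) {
      /* Spot-check that both forms produce the same high word; they agree
         for all inputs because 32-bit addition wraps mod 2^32. */
      uint32_t s[] = {0u, 1u, 0x12d8fbu, 0xffed2705u, 0xffffffffu};
      for (unsigned i = 0; i < 5; i++)
        for (unsigned j = 0; j < 5; j++)
          assert(add_hi_late_carry(s[i], s[j], s[j], s[i]) ==
                 add_hi_carry_chain(s[i], s[j], s[j], s[i]));
      return 0;
    }

The same substitution recurs throughout the GISEL and CGP hunks that follow; it is why each of them nets out one v_add and one v_addc shorter.
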
@@ -1849,38 +1825,36 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT:    v_addc_u32_e64 v12, s[4:5], v9, v13, vcc
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v6
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, v10, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v10, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v10, v6
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v11, v10
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v10
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v13
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v14, v11
-; GISEL-NEXT:    v_mul_lo_u32 v14, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v13
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v10
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v13, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v13
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v10
-; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v10, v13
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v12
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v10
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT:    v_mul_hi_u32 v14, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v9, vcc, 0, v9, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v0, v9
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v0, v6
@@ -1975,38 +1949,36 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_addc_u32_e64 v10, s[4:5], v7, v11, vcc
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v11
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v6
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v8, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, v8, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v8, v8, v6
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v12, v6, v8
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v11
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v9, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v11
-; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v13
-; GISEL-NEXT:    v_add_i32_e64 v9, s[4:5], v11, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v11, s[4:5], v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v8
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v11
-; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_lo_u32 v9, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v7, vcc, 0, v7, s[4:5]
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v9, v2, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v2, v6
@@ -2121,38 +2093,36 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT:    v_addc_u32_e64 v13, s[4:5], v1, v12, vcc
-; CGP-NEXT:    v_add_i32_e64 v1, s[4:5], v1, v12
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v12, v4, v0
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v0
-; CGP-NEXT:    v_mul_hi_u32 v14, v4, v0
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v13
-; CGP-NEXT:    v_mul_lo_u32 v15, v13, v12
-; CGP-NEXT:    v_mul_hi_u32 v16, v0, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v13, v12
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v14
+; CGP-NEXT:    v_mul_hi_u32 v13, v4, v0
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v1
+; CGP-NEXT:    v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT:    v_mul_hi_u32 v15, v0, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
 ; CGP-NEXT:    v_mul_lo_u32 v6, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v14, v13, v4
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v13, v4
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v15, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v14, v15
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v12, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v12, s[4:5], v13, v12
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v4
+; CGP-NEXT:    v_mul_hi_u32 v16, v0, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
 ; CGP-NEXT:    v_mul_hi_u32 v6, v8, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
@@ -2282,38 +2252,36 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
-; CGP-NEXT:    v_addc_u32_e64 v9, s[4:5], v3, v8, vcc
-; CGP-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v8
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v8, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT:    v_mul_hi_u32 v12, v4, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v9
-; CGP-NEXT:    v_mul_lo_u32 v13, v9, v8
-; CGP-NEXT:    v_mul_hi_u32 v14, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v9, v8
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v6, v4
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v12
+; CGP-NEXT:    v_mul_hi_u32 v9, v4, v2
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT:    v_mul_lo_u32 v12, v3, v8
+; CGP-NEXT:    v_mul_hi_u32 v13, v2, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; CGP-NEXT:    v_mul_lo_u32 v6, v2, v4
-; CGP-NEXT:    v_mul_lo_u32 v12, v9, v4
-; CGP-NEXT:    v_mul_hi_u32 v15, v2, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v4
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v13, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v12, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v9, v6
-; CGP-NEXT:    v_add_i32_e64 v9, s[4:5], v12, v13
-; CGP-NEXT:    v_add_i32_e64 v6, s[4:5], v8, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v8, s[4:5], v9, v8
-; CGP-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
+; CGP-NEXT:    v_mul_lo_u32 v9, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v14, v2, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v4, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v4, v7, v2
 ; CGP-NEXT:    v_mul_hi_u32 v6, v5, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v7, v2
@@ -2529,190 +2497,186 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v19, vcc, v20, v19
-; GISEL-NEXT:    s_bfe_i32 s10, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s11, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s12, -1, 0x10000
-; GISEL-NEXT:    s_bfe_i32 s13, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s4, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s5, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s7, -1, 0x10000
+; GISEL-NEXT:    s_bfe_i32 s8, -1, 0x10000
 ; GISEL-NEXT:    v_and_b32_e32 v0, s6, v0
 ; GISEL-NEXT:    v_and_b32_e32 v2, s6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
-; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT:    v_mov_b32_e32 v16, s4
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v19, v18
+; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT:    v_mov_b32_e32 v19, s5
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_mov_b32_e32 v15, s7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v14
-; GISEL-NEXT:    v_addc_u32_e64 v14, s[4:5], v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v4, v6
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v12, v4, v6
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v6
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v7, v17
-; GISEL-NEXT:    v_addc_u32_e64 v16, s[6:7], v11, v13, s[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_hi_u32 v14, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
+; GISEL-NEXT:    v_addc_u32_e32 v11, vcc, v11, v13, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v18, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v16
-; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v17
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v17
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_mul_lo_u32 v18, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v10, s[6:7], v18, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v4, v6
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v14
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v15
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v15
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v4, v10
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v8
+; GISEL-NEXT:    v_mul_lo_u32 v18, v8, v12
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT:    v_mul_hi_u32 v14, v7, v13
+; GISEL-NEXT:    v_mul_hi_u32 v13, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
+; GISEL-NEXT:    v_mul_lo_u32 v17, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v4
-; GISEL-NEXT:    v_add_i32_e64 v10, s[8:9], v18, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v5, s[8:9], v10, v5
-; GISEL-NEXT:    v_mov_b32_e32 v5, s10
-; GISEL-NEXT:    v_mov_b32_e32 v10, s11
-; GISEL-NEXT:    v_add_i32_e64 v8, s[10:11], v8, v12
-; GISEL-NEXT:    v_mov_b32_e32 v12, s12
-; GISEL-NEXT:    v_add_i32_e64 v11, s[10:11], v11, v13
-; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v15
-; GISEL-NEXT:    v_mul_hi_u32 v15, v16, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v17, s[8:9], v18, v17
-; GISEL-NEXT:    v_mul_lo_u32 v18, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v14, v14, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v6, v4
-; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[8:9], v13, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[8:9]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[8:9], v18, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v18, s[6:7], v19, v18
-; GISEL-NEXT:    v_mul_lo_u32 v19, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v16, v16, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v15, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v19, v15
-; GISEL-NEXT:    v_mov_b32_e32 v19, s13
-; GISEL-NEXT:    v_add_i32_e64 v4, s[6:7], v4, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v9, s[6:7], v9, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[6:7]
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v13, v17
-; GISEL-NEXT:    v_add_i32_e64 v15, s[6:7], v15, v18
-; GISEL-NEXT:    v_add_i32_e64 v13, s[6:7], v14, v13
-; GISEL-NEXT:    v_add_i32_e64 v14, s[6:7], v16, v15
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
-; GISEL-NEXT:    v_addc_u32_e64 v11, vcc, v11, v14, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v8, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v13, v0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v15, v0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v17, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, 0, v6
-; GISEL-NEXT:    v_mul_lo_u32 v18, v2, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, v4
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v10, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v6, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v10, s[4:5], v18, v10
+; GISEL-NEXT:    v_mul_lo_u32 v18, v11, v9
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v12, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v12, s[4:5], v14, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v14
+; GISEL-NEXT:    v_mul_hi_u32 v17, v7, v9
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v11, 0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v14, v2, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v16, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v8, s[4:5], v8, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
+; GISEL-NEXT:    v_mov_b32_e32 v18, s8
+; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v6, 0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v11, v9, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v14, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v17, v2, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_lo_u32 v10, 0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v2, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, 0, v9
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v13, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v6, s[4:5], v6, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT:    v_mul_lo_u32 v13, v3, v4
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v7
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v14
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, 0, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_mul_lo_u32 v11, v1, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, 0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v11
-; GISEL-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GISEL-NEXT:    v_mul_lo_u32 v8, v1, v8
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v16, v8
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v7
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT:    v_subb_u32_e64 v7, s[4:5], 0, v4, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v10
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v13, v6
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], 0, v4, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v4, s[4:5], 0, v4
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v14
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[6:7], 0, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[6:7], 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v2, s[4:5], v2, v11
+; GISEL-NEXT:    v_subb_u32_e64 v8, s[6:7], 0, v5, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e64 v5, s[6:7], 0, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v2, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[6:7]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[6:7]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, v16, v7, s[6:7]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v12, v11, vcc
-; GISEL-NEXT:    v_subbrev_u32_e64 v6, vcc, 0, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v15, v9, vcc
+; GISEL-NEXT:    v_subbrev_u32_e64 v5, vcc, 0, v5, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v0, v3
 ; GISEL-NEXT:    v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v2, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v2, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v12, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v11, v3
-; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v4, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v14, v19, v14, vcc
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v6, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v11, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v11, v19, v11, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v10, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v4, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v4, v4, v14, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v6, v15, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v5, v15, s[4:5]
+; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v4, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_24bit:

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index 1ce32129aad3..c94812199bd9 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -9249,8 +9249,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX6-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX6-NEXT:    s_mov_b32 s3, 0x68958c89
+; GFX6-NEXT:    s_movk_i32 s4, 0xfee0
+; GFX6-NEXT:    s_mov_b32 s5, 0x68958c89
 ; GFX6-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -9259,15 +9259,14 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0
-; GFX6-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s8, s4
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
+; GFX6-NEXT:    s_movk_i32 s8, 0x11f
+; GFX6-NEXT:    s_mov_b32 s9, 0x976a7377
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v3
@@ -9282,70 +9281,69 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s5
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    s_movk_i32 s2, 0x11f
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    s_mov_b32 s3, 0x976a7377
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0x976a7376
-; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX6-NEXT:    v_mov_b32_e32 v5, s2
-; GFX6-NEXT:    s_mov_b32 s9, s5
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s8
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
+; GFX6-NEXT:    v_mov_b32_e32 v5, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s9
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s7, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s6, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s3, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s9, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    s_movk_i32 s3, 0x11e
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s3, v4
+; GFX6-NEXT:    s_movk_i32 s2, 0x11e
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s2, v4
+; GFX6-NEXT:    s_mov_b32 s9, 0x976a7376
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s4, v5
+; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s9, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s2, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s8, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -9353,19 +9351,19 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v6, s7
+; GFX6-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v2
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
+; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s9, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s2, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s8, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: udiv_i64_oddk_denom:
@@ -9374,8 +9372,8 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
-; GFX9-NEXT:    s_mov_b32 s5, 0x68958c89
+; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
+; GFX9-NEXT:    s_mov_b32 s3, 0x68958c89
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -9384,10 +9382,11 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s3
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -9404,33 +9403,32 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s3
+; GFX9-NEXT:    s_movk_i32 s2, 0x11f
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
@@ -9441,8 +9439,6 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_movk_i32 s2, 0x11f
-; GFX9-NEXT:    s_mov_b32 s3, 0x976a7377
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
@@ -9493,22 +9489,22 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX90A-NEXT:    s_mov_b32 s3, 0x68958c89
-; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b32 s0, 0x68958c89
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
@@ -9523,32 +9519,30 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s3
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, s0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
+; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
@@ -9783,7 +9777,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_movk_i32 s2, 0xf001
+; GFX6-NEXT:    s_movk_i32 s6, 0xf001
 ; GFX6-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9793,12 +9787,13 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_lshr_b64 s[8:9], s[0:1], 12
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -9815,43 +9810,40 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_hi_u32 v4, v0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s2
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshr_b64 s[2:3], s[8:9], 12
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v6
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v6
-; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s11, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s6
+; GFX6-NEXT:    s_movk_i32 s0, 0xfff
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_movk_i32 s0, 0xfff
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -9865,8 +9857,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s11
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s3
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -9885,8 +9877,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v0, v1, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
@@ -9895,7 +9887,7 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xf001
+; GFX9-NEXT:    s_movk_i32 s2, 0xf001
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -9904,10 +9896,12 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    s_movk_i32 s8, 0xfff
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s4
-; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, s2
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v3
@@ -9924,34 +9918,32 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_hi_u32 v4, v0, s4
-; GFX9-NEXT:    v_mul_lo_u32 v6, v2, s4
-; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s4
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v8
-; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v9, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v8, v2, v8
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s2
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v8, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX9-NEXT:    s_movk_i32 s4, 0xffe
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v1
@@ -9961,8 +9953,6 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_lshr_b64 s[2:3], s[4:5], 12
-; GFX9-NEXT:    s_movk_i32 s4, 0xffe
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
@@ -10002,7 +9992,8 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x457ff000
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_movk_i32 s8, 0xf001
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -10011,13 +10002,14 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s8
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_lshr_b64 s[0:1], s[4:5], 12
+; GFX90A-NEXT:    s_movk_i32 s4, 0xf001
+; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s4
 ; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s8
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
@@ -10032,32 +10024,29 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s8
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
-; GFX90A-NEXT:    v_sub_u32_e32 v6, v6, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s8
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v2, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
@@ -10072,18 +10061,18 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s0, 0xfff
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s0
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s0
+; GFX90A-NEXT:    s_movk_i32 s4, 0xfff
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s4
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s0, v3
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s4, v3
 ; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s0, 0xffe
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
+; GFX90A-NEXT:    s_movk_i32 s4, 0xffe
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
@@ -10091,16 +10080,15 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v3
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
-; GFX90A-NEXT:    s_lshr_b64 s[4:5], s[4:5], 12
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v0, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v1, v6, vcc
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
@@ -10227,56 +10215,54 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v4, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v6, v2, s3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s3
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v11, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
 ; GFX6-NEXT:    s_movk_i32 s4, 0x11f
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; GFX6-NEXT:    s_mov_b32 s9, s5
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX6-NEXT:    s_movk_i32 s5, 0x11e
-; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    s_movk_i32 s5, 0x11e
+; GFX6-NEXT:    s_mov_b32 s11, 0xf000
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s6, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s7, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GFX6-NEXT:    s_mov_b32 s10, -1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, v0, s4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s12
-; GFX6-NEXT:    s_mov_b32 s10, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s7, v1
@@ -10318,8 +10304,8 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX9-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0xfee0
-; GFX9-NEXT:    s_mov_b32 s5, 0x689e0837
+; GFX9-NEXT:    s_movk_i32 s2, 0xfee0
+; GFX9-NEXT:    s_mov_b32 s3, 0x689e0837
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -10328,12 +10314,12 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s3
 ; GFX9-NEXT:    s_movk_i32 s8, 0x11f
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s5
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s5
-; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -10345,39 +10331,37 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GFX9-NEXT:    s_mov_b32 s9, 0x9761f7c9
 ; GFX9-NEXT:    s_mov_b32 s10, 0x9761f7c8
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v2, s5
-; GFX9-NEXT:    v_mul_lo_u32 v9, v0, s5
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v8, v10, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v10, v2, v9
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v9
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, s2
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX9-NEXT:    v_mul_lo_u32 v6, v0, s3
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v6
+; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v6
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v4, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v8, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
@@ -10439,22 +10423,22 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_madmk_f32 v0, v1, 0x438f8000, v0
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_movk_i32 s2, 0xfee0
-; GFX90A-NEXT:    s_mov_b32 s3, 0x689e0837
-; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_mov_b32 s0, 0x689e0837
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s3
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s3
+; GFX90A-NEXT:    v_mul_lo_u32 v4, v1, s0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
@@ -10469,32 +10453,30 @@ define amdgpu_kernel void @urem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s3
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s3
-; GFX90A-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v0, s3
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v0, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v10, v7
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v10, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v6, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, s0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s0
+; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v1, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
@@ -10824,7 +10806,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
+; GFX6-NEXT:    s_mov_b32 s5, 0xffed2705
 ; GFX6-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -10833,14 +10815,14 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s5
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s8
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -10857,69 +10839,66 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
-; GFX6-NEXT:    s_mov_b32 s5, s9
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s1, s11, s2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s5
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s5
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s3
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s0
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s0
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mov_b32_e32 v5, s1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
+; GFX6-NEXT:    v_mov_b32_e32 v5, s3
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s3, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s0, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX6-NEXT:    s_mov_b32 s5, s1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
 ; GFX6-NEXT:    v_cmp_lt_u32_e64 s[0:1], s0, v8
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
@@ -10931,10 +10910,11 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -10944,7 +10924,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
+; GFX9-NEXT:    s_mov_b32 s4, 0xffed2705
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -10953,13 +10933,10 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -10976,34 +10953,35 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
-; GFX9-NEXT:    s_add_u32 s2, s6, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
@@ -11013,23 +10991,23 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fb
+; GFX9-NEXT:    s_mov_b32 s5, 0x12d8fb
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s1
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s1
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s1
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s1, v4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
+; GFX9-NEXT:    s_mov_b32 s2, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
@@ -11037,19 +11015,19 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v4
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: sdiv_i64_oddk_denom:
@@ -11057,7 +11035,7 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_mov_b32 s2, 0xffed2705
+; GFX90A-NEXT:    s_mov_b32 s4, 0xffed2705
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -11066,12 +11044,12 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
@@ -11086,37 +11064,35 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, v8, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v8, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v6, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX90A-NEXT:    s_add_u32 s2, s6, s0
+; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    s_add_u32 s2, s2, s4
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_mov_b32 s1, s0
-; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX90A-NEXT:    s_mov_b32 s5, s4
+; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
@@ -11131,18 +11107,18 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s1
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s1
+; GFX90A-NEXT:    s_mov_b32 s5, 0x12d8fb
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s5
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s5
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s1
+; GFX90A-NEXT:    v_mul_lo_u32 v4, v0, s5
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s3
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v4, vcc, s2, v4
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v4
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v4
 ; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fa
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v5
+; GFX90A-NEXT:    s_mov_b32 s2, 0x12d8fa
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
@@ -11150,19 +11126,19 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v4
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, -1, v4, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
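
A note for readers tracing the new checks: this test divides by the odd constant 1235195 = 0x12d8fb, so 0xffed2705 is its 32-bit two's-complement negation and 0x12d8fa is the divisor minus one, which the remainder comparisons test against. Across every hunk the savings come from one change of addition pattern: the removed lines added the high halves with no carry-in and folded the carry back with a trailing "v_addc ..., 0, v1", while the added lines chain the carry directly. Below is a stand-alone C sketch of the two equivalent forms, with invented helper names; it is a reading aid, not compiler code.

    #include <assert.h>
    #include <stdint.h>

    /* New pattern: the low add defines the carry (v_add_co writes vcc)
       and a single add-with-carry on the high half consumes it.        */
    static uint64_t add64_chained(uint32_t alo, uint32_t ahi,
                                  uint32_t blo, uint32_t bhi) {
      uint32_t lo = alo + blo;
      uint32_t carry = lo < alo;        /* carry-out of the low half */
      uint32_t hi = ahi + bhi + carry;  /* carry-in consumed here    */
      return ((uint64_t)hi << 32) | lo;
    }

    /* Old pattern, as in the removed lines: the high halves are added
       without a carry-in, and the carry is patched in by an extra
       add-with-carry afterwards; same value, more instructions.       */
    static uint64_t add64_deferred(uint32_t alo, uint32_t ahi,
                                   uint32_t blo, uint32_t bhi) {
      uint32_t lo = alo + blo;
      uint32_t carry = lo < alo;
      uint32_t hi = ahi + bhi;          /* no carry-in yet           */
      hi += carry;                      /* the late "addc ..., 0"    */
      return ((uint64_t)hi << 32) | lo;
    }

    int main(void) {
      assert(add64_chained(0xffffffffu, 1u, 1u, 2u) ==
             add64_deferred(0xffffffffu, 1u, 1u, 2u)); /* 0x400000000 */
      return 0;
    }
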
@@ -11236,35 +11212,38 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-LABEL: sdiv_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
-; GFX6-NEXT:    s_ashr_i32 s2, s5, 31
-; GFX6-NEXT:    s_add_u32 s4, s4, s2
-; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s5, s5, s2
-; GFX6-NEXT:    s_xor_b64 s[12:13], s[4:5], s[2:3]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX6-NEXT:    s_sub_u32 s4, 0, s12
-; GFX6-NEXT:    s_subb_u32 s5, 0, s13
-; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
+; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX6-NEXT:    s_sub_u32 s4, 0, s10
+; GFX6-NEXT:    s_subb_u32 s5, 0, s11
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s15, s14
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    s_add_u32 s2, s2, s12
+; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GFX6-NEXT:    s_addc_u32 s3, s3, s12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -11276,6 +11255,7 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
@@ -11283,68 +11263,63 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s4, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, s4, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GFX6-NEXT:    s_mov_b32 s5, s9
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s4, v0
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v7, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GFX6-NEXT:    s_mov_b32 s4, s8
+; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s13, v0
-; GFX6-NEXT:    v_mov_b32_e32 v5, s13
+; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GFX6-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GFX6-NEXT:    v_mul_lo_u32 v3, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s10, v0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
+; GFX6-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GFX6-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -11352,18 +11327,18 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v6, s11
+; GFX6-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[14:15], s[2:3]
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -11379,16 +11354,16 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX9-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX9-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX9-NEXT:    s_add_u32 s2, s2, s8
-; GFX9-NEXT:    s_mov_b32 s9, s8
-; GFX9-NEXT:    s_addc_u32 s3, s3, s8
-; GFX9-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
-; GFX9-NEXT:    s_sub_u32 s12, 0, s10
-; GFX9-NEXT:    s_subb_u32 s4, 0, s11
+; GFX9-NEXT:    s_lshl_b64 s[4:5], s[2:3], s4
+; GFX9-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX9-NEXT:    s_add_u32 s4, s4, s2
+; GFX9-NEXT:    s_mov_b32 s3, s2
+; GFX9-NEXT:    s_addc_u32 s5, s5, s2
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX9-NEXT:    s_sub_u32 s10, 0, s8
+; GFX9-NEXT:    s_subb_u32 s4, 0, s9
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -11397,10 +11372,10 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s12, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s12, v0
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
@@ -11418,39 +11393,37 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v5, s12, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, s12, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v9, s12, v0
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GFX9-NEXT:    v_mul_lo_u32 v7, s10, v0
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v7
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX9-NEXT:    s_add_u32 s0, s6, s2
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s1, s7, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
+; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -11465,39 +11438,39 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v3, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GFX9-NEXT:    v_mov_b32_e32 v6, s11
+; GFX9-NEXT:    v_mul_lo_u32 v3, s8, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s8, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s9, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, s9
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s10, v0
+; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v0
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
 ; GFX9-NEXT:    v_sub_u32_e32 v5, s7, v3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v4
 ; GFX9-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s10, v4
+; GFX9-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s8, v4
 ; GFX9-NEXT:    v_subbrev_co_u32_e64 v5, s[0:1], 0, v5, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v6
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v6
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, v7, v6, s[0:1]
 ; GFX9-NEXT:    v_mov_b32_e32 v7, s7
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s10, v4
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s8, v4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
 ; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v0, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], 0, v1, s[0:1]
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[8:9]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[10:11], s[2:3]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -11521,22 +11494,25 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    s_xor_b64 s[8:9], s[4:5], s[2:3]
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
-; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s10, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s10, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
@@ -11553,37 +11529,32 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s11, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
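
The float bit patterns that recur throughout these checks are the usual scale factors for seeding the 64-bit reciprocal from a 32-bit v_rcp_f32 estimate; they are unchanged by this patch. A small stand-alone decoder, for reference only (nothing below is compiler code):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static float from_bits(uint32_t u) {
      float f;
      memcpy(&f, &u, sizeof f);  /* reinterpret the IEEE-754 bit pattern */
      return f;
    }

    int main(void) {
      printf("%.9g\n", from_bits(0x4f800000u)); /* 2^32: folds the high word into one float   */
      printf("%.9g\n", from_bits(0x5f7ffffcu)); /* just below 2^64: scales rcp to fixed point */
      printf("%.9g\n", from_bits(0x2f800000u)); /* 2^-32: extracts the high half of the rcp   */
      printf("%.9g\n", from_bits(0xcf800000u)); /* -2^32: subtracts that half from the low    */
      return 0;
    }
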
@@ -11750,7 +11721,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_movk_i32 s6, 0xf001
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
@@ -11759,13 +11730,13 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s0, s9, 31
-; GFX6-NEXT:    s_lshr_b32 s0, s0, 20
+; GFX6-NEXT:    s_ashr_i32 s8, s1, 31
+; GFX6-NEXT:    s_lshr_b32 s8, s8, 20
 ; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s6
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v1, s6
-; GFX6-NEXT:    s_add_u32 s2, s8, s0
-; GFX6-NEXT:    s_addc_u32 s3, s9, 0
-; GFX6-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
+; GFX6-NEXT:    s_add_u32 s0, s0, s8
+; GFX6-NEXT:    s_addc_u32 s1, s1, 0
+; GFX6-NEXT:    s_ashr_i64 s[8:9], s[0:1], 12
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
@@ -11778,8 +11749,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX6-NEXT:    s_ashr_i32 s8, s11, 31
-; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
+; GFX6-NEXT:    s_add_u32 s0, s2, s10
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
@@ -11787,34 +11758,32 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v5, v2, s6
-; GFX6-NEXT:    v_mul_hi_u32 v7, v0, s6
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, v0, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, v0, v5
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_addc_u32 s1, s11, s8
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[8:9]
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s6
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s6
+; GFX6-NEXT:    s_mov_b32 s11, s10
+; GFX6-NEXT:    s_addc_u32 s1, s3, s10
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s6
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s0, v1
@@ -11824,17 +11793,17 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s1, v0
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    s_movk_i32 s9, 0xfff
+; GFX6-NEXT:    s_movk_i32 s2, 0xfff
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s9
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v4, v1, s2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, 2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, s2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v1, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
@@ -11842,7 +11811,7 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_mov_b32_e32 v5, s1
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s0, v8
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s9, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s2, v8
 ; GFX6-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
 ; GFX6-NEXT:    s_movk_i32 s0, 0xffe
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v5
@@ -11859,13 +11828,13 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s8
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s10
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s10, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v3, vcc, v1, v3, vcc
-; GFX6-NEXT:    v_mov_b32_e32 v0, s2
-; GFX6-NEXT:    v_mov_b32_e32 v1, s3
+; GFX6-NEXT:    v_mov_b32_e32 v0, s8
+; GFX6-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX6-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
 ;
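
The GFX6 block above also shows the power-of-two lane of this mixed test: the sign word is shifted down to form a 0-or-4095 bias (the s_lshr_b32 by 20 on the 32-bit sign mask) so that the final s_ashr_i64 by 12 truncates toward zero. A minimal sketch with the divisor 4096 hard-coded, again purely illustrative, assuming arithmetic right shift of signed values as on all mainstream targets:

    #include <assert.h>
    #include <stdint.h>

    /* Signed divide by 4096 without a divide: bias negative dividends
       by 4095 so the arithmetic shift rounds toward zero rather than
       toward negative infinity. */
    static int64_t sdiv_by_4096(int64_t x) {
      uint64_t sign = (uint64_t)(x >> 63);         /* 0 or ~0 (s_ashr_i32 ..., 31) */
      int64_t bias = (int64_t)(sign >> (64 - 12)); /* 0 or 4095                    */
      return (x + bias) >> 12;                     /* s_ashr_i64 ..., 12           */
    }

    int main(void) {
      assert(sdiv_by_4096(-4097) == -1);
      assert(sdiv_by_4096(-1) == 0);
      assert(sdiv_by_4096(8191) == 1);
      return 0;
    }
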
@@ -11876,8 +11845,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX9-NEXT:    s_movk_i32 s8, 0xf001
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
@@ -11890,8 +11859,8 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v1, s8
 ; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
-; GFX9-NEXT:    s_add_u32 s4, s4, s2
-; GFX9-NEXT:    s_addc_u32 s5, s5, 0
+; GFX9-NEXT:    s_add_u32 s2, s4, s2
+; GFX9-NEXT:    s_addc_u32 s3, s5, 0
 ; GFX9-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_lo_u32 v3, v0, v2
@@ -11903,44 +11872,41 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v5
-; GFX9-NEXT:    s_ashr_i64 s[4:5], s[4:5], 12
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
+; GFX9-NEXT:    s_ashr_i32 s4, s7, 31
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v5, v2, s8
-; GFX9-NEXT:    v_mul_hi_u32 v7, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v0
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
-; GFX9-NEXT:    s_ashr_i32 s2, s7, 31
-; GFX9-NEXT:    s_add_u32 s6, s6, s2
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_mov_b32 s3, s2
-; GFX9-NEXT:    s_addc_u32 s7, s7, s2
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[2:3]
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s8
+; GFX9-NEXT:    s_add_u32 s6, s6, s4
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v1, v5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    s_addc_u32 s7, s7, s4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -11950,23 +11916,24 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v5, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v5, s7, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s7, v0
-; GFX9-NEXT:    s_movk_i32 s3, 0xfff
+; GFX9-NEXT:    s_movk_i32 s5, 0xfff
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s3
-; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s3
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s5
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v5, v0, s5
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s7
 ; GFX9-NEXT:    v_sub_co_u32_e32 v5, vcc, s6, v5
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v2, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s3, v5
+; GFX9-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v5
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX9-NEXT:    s_movk_i32 s3, 0xffe
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v3
+; GFX9-NEXT:    s_movk_i32 s5, 0xffe
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
@@ -11974,20 +11941,20 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 1, 2, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v0, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s3, v5
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v5
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, -1, v5, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, s2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT:    v_mov_b32_e32 v0, s4
-; GFX9-NEXT:    v_mov_b32_e32 v1, s5
+; GFX9-NEXT:    v_mov_b32_e32 v0, s2
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -11998,7 +11965,6 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0, v1
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_movk_i32 s8, 0xf001
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
@@ -12011,11 +11977,13 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s0, s5, 31
 ; GFX90A-NEXT:    s_lshr_b32 s0, s0, 20
-; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s8
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s8
+; GFX90A-NEXT:    s_add_u32 s0, s4, s0
+; GFX90A-NEXT:    s_movk_i32 s4, 0xf001
+; GFX90A-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s8
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
@@ -12026,44 +11994,41 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
-; GFX90A-NEXT:    s_add_u32 s0, s4, s0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
-; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
-; GFX90A-NEXT:    s_ashr_i64 s[4:5], s[0:1], 12
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v2
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v2, s8
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, s8
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v0, s8
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v2, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX90A-NEXT:    s_add_u32 s6, s6, s0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX90A-NEXT:    s_mov_b32 s1, s0
-; GFX90A-NEXT:    s_addc_u32 s7, s7, s0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_sub_u32_e32 v2, v2, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX90A-NEXT:    s_addc_u32 s1, s5, 0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX90A-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX90A-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX90A-NEXT:    s_add_u32 s6, s6, s4
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    s_mov_b32 s5, s4
+; GFX90A-NEXT:    s_addc_u32 s7, s7, s4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v2, s6, v1
@@ -12078,18 +12043,18 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s1, 0xfff
-; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
+; GFX90A-NEXT:    s_movk_i32 s5, 0xfff
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, s5
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s5
 ; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s1
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v0, s5
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v3, vcc, s6, v3
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v3
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v3
 ; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v2, vcc
-; GFX90A-NEXT:    s_movk_i32 s1, 0xffe
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v5
+; GFX90A-NEXT:    s_movk_i32 s5, 0xffe
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
@@ -12097,20 +12062,20 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(<2 x i64> addrspace(1)*
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 1, 2, vcc
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v0, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v2, -1, v3, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX90A-NEXT:    v_mov_b32_e32 v0, s4
-; GFX90A-NEXT:    v_mov_b32_e32 v1, s5
+; GFX90A-NEXT:    v_mov_b32_e32 v0, s0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
@@ -12140,42 +12105,49 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_mov_b32 s19, 0x5f7ffffc
 ; GFX6-NEXT:    s_mov_b32 s20, 0x2f800000
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_lshl_b64 s[12:13], s[2:3], s6
+; GFX6-NEXT:    s_lshl_b64 s[8:9], s[2:3], s6
 ; GFX6-NEXT:    s_lshl_b64 s[2:3], s[2:3], s4
-; GFX6-NEXT:    s_ashr_i32 s16, s3, 31
-; GFX6-NEXT:    s_add_u32 s2, s2, s16
-; GFX6-NEXT:    s_mov_b32 s17, s16
-; GFX6-NEXT:    s_addc_u32 s3, s3, s16
-; GFX6-NEXT:    s_xor_b64 s[14:15], s[2:3], s[16:17]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s14
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX6-NEXT:    s_ashr_i32 s12, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s12
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s3, s3, s12
+; GFX6-NEXT:    s_xor_b64 s[10:11], s[2:3], s[12:13]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
-; GFX6-NEXT:    s_sub_u32 s6, 0, s14
-; GFX6-NEXT:    s_subb_u32 s7, 0, s15
+; GFX6-NEXT:    s_sub_u32 s6, 0, s10
+; GFX6-NEXT:    s_subb_u32 s7, 0, s11
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0xd
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s21, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v0
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_ashr_i32 s14, s1, 31
+; GFX6-NEXT:    s_add_u32 s0, s0, s14
 ; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
 ; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
+; GFX6-NEXT:    s_mov_b32 s15, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v3, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v1
+; GFX6-NEXT:    s_addc_u32 s1, s1, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GFX6-NEXT:    s_xor_b64 s[16:17], s[0:1], s[14:15]
+; GFX6-NEXT:    s_xor_b64 s[14:15], s[14:15], s[12:13]
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v1
@@ -12184,92 +12156,82 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
-; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
-; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_ashr_i32 s2, s9, 31
-; GFX6-NEXT:    s_add_u32 s0, s8, s2
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s1, s9, s2
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
-; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
-; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, s9, v2
-; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v2
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s6, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, s7, v3
+; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
+; GFX6-NEXT:    s_add_u32 s8, s8, s12
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v6, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s16, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s16, v3
+; GFX6-NEXT:    v_mul_hi_u32 v6, s16, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, s17, v2
+; GFX6-NEXT:    v_mul_lo_u32 v2, s17, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
-; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
-; GFX6-NEXT:    s_xor_b64 s[16:17], s[2:3], s[16:17]
-; GFX6-NEXT:    s_ashr_i32 s2, s13, 31
+; GFX6-NEXT:    v_mul_lo_u32 v6, s17, v3
+; GFX6-NEXT:    v_mul_hi_u32 v3, s17, v3
+; GFX6-NEXT:    s_mov_b32 s13, s12
+; GFX6-NEXT:    s_addc_u32 s9, s9, s12
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v1, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s14, v3
-; GFX6-NEXT:    v_mul_hi_u32 v5, s14, v2
-; GFX6-NEXT:    v_mul_lo_u32 v6, s15, v2
-; GFX6-NEXT:    v_mov_b32_e32 v7, s15
-; GFX6-NEXT:    s_mov_b32 s3, s2
+; GFX6-NEXT:    v_mul_lo_u32 v4, s10, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, s11, v2
+; GFX6-NEXT:    v_mov_b32_e32 v7, s11
+; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[12:13]
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, s14, v2
+; GFX6-NEXT:    v_mul_lo_u32 v5, s10, v2
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s9, v4
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s8, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s17, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s16, v5
 ; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s14, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s10, v5
 ; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s15, v6
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v7
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s15, v6
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v7, s[0:1], 2, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v3, s[0:1]
 ; GFX6-NEXT:    v_add_i32_e64 v9, s[0:1], 1, v2
 ; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v3, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
-; GFX6-NEXT:    s_add_u32 s8, s12, s2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v8, s9
-; GFX6-NEXT:    s_addc_u32 s9, s13, s2
-; GFX6-NEXT:    s_xor_b64 s[8:9], s[8:9], s[2:3]
+; GFX6-NEXT:    v_mov_b32_e32 v8, s17
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v10, s8
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v11, s9
 ; GFX6-NEXT:    v_subb_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s15, v4
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s11, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s14, v5
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s10, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s15, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v4
 ; GFX6-NEXT:    v_mac_f32_e32 v10, s18, v11
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v8, v5, vcc
 ; GFX6-NEXT:    v_rcp_f32_e32 v5, v10
@@ -12282,15 +12244,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX6-NEXT:    s_sub_u32 s12, 0, s8
+; GFX6-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v4, s12, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v6
-; GFX6-NEXT:    s_subb_u32 s13, 0, s9
-; GFX6-NEXT:    v_mul_lo_u32 v8, s13, v5
-; GFX6-NEXT:    v_xor_b32_e32 v2, s16, v2
+; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v6
+; GFX6-NEXT:    s_subb_u32 s1, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v8, s1, v5
+; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, s12, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
@@ -12301,54 +12263,53 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v10, v6, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GFX6-NEXT:    v_xor_b32_e32 v3, s17, v3
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    s_mov_b32 s11, s10
+; GFX6-NEXT:    v_xor_b32_e32 v2, s14, v2
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
-; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v8, s12, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, s12, v4
-; GFX6-NEXT:    v_mul_lo_u32 v10, s13, v4
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GFX6-NEXT:    v_mul_lo_u32 v9, s12, v4
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
-; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
-; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
-; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
-; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
-; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GFX6-NEXT:    s_ashr_i32 s12, s11, 31
-; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s12
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GFX6-NEXT:    s_mov_b32 s13, s12
-; GFX6-NEXT:    s_addc_u32 s1, s11, s12
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[12:13]
-; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
-; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, s10, v5
-; GFX6-NEXT:    v_mul_hi_u32 v10, s11, v5
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, s1, v4
+; GFX6-NEXT:    v_xor_b32_e32 v3, s15, v3
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v4
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX6-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v11, v4, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GFX6-NEXT:    v_mul_hi_u32 v8, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v8, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
+; GFX6-NEXT:    s_add_u32 s0, s2, s10
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
+; GFX6-NEXT:    s_addc_u32 s1, s3, s10
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[0:1], s[10:11]
+; GFX6-NEXT:    v_mul_lo_u32 v6, s2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, s3, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, 0, v9, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v9, s11, v4
-; GFX6-NEXT:    v_mul_hi_u32 v4, s11, v4
-; GFX6-NEXT:    v_mov_b32_e32 v8, s17
+; GFX6-NEXT:    v_mul_lo_u32 v9, s3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v4, s3, v4
+; GFX6-NEXT:    v_mov_b32_e32 v8, s15
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
@@ -12356,15 +12317,15 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v0, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s8, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s8, v4
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s16, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v2, s9, v4
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v3, v8, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v7, v6
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v3, s8, v4
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s11, v2
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s3, v2
 ; GFX6-NEXT:    v_mov_b32_e32 v7, s9
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GFX6-NEXT:    v_subb_u32_e64 v6, s[0:1], v6, v7, vcc
 ; GFX6-NEXT:    v_subrev_i32_e64 v7, s[0:1], s8, v3
 ; GFX6-NEXT:    v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1]
@@ -12380,7 +12341,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e64 v10, s[0:1], 0, v5, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v10, v8, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v8, s11
+; GFX6-NEXT:    v_mov_b32_e32 v8, s3
 ; GFX6-NEXT:    v_subb_u32_e32 v2, vcc, v8, v2, vcc
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
@@ -12391,7 +12352,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX6-NEXT:    v_cndmask_b32_e64 v3, v9, v7, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[12:13], s[2:3]
+; GFX6-NEXT:    s_xor_b64 s[0:1], s[10:11], s[12:13]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_xor_b32_e32 v4, s1, v2
@@ -12419,20 +12380,24 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s10
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s11
 ; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX9-NEXT:    s_sub_u32 s14, 0, s10
-; GFX9-NEXT:    s_subb_u32 s4, 0, s11
+; GFX9-NEXT:    s_sub_u32 s2, 0, s10
+; GFX9-NEXT:    s_subb_u32 s3, 0, s11
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s14, v2
-; GFX9-NEXT:    v_mul_hi_u32 v1, s14, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX9-NEXT:    s_mov_b32 s15, s14
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v1, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
@@ -12451,38 +12416,32 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v6, s14, v4
-; GFX9-NEXT:    v_mul_hi_u32 v7, s14, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v9, s14, v3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX9-NEXT:    s_add_u32 s2, s4, s14
+; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v7
+; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    s_mov_b32 s15, s14
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s14
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    s_xor_b64 s[4:5], s[2:3], s[14:15]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s4, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s4, v3
@@ -12494,7 +12453,6 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s5, v3
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s5, v3
 ; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX9-NEXT:    s_xor_b64 s[12:13], s[14:15], s[12:13]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v0, vcc
@@ -12522,12 +12480,13 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
 ; GFX9-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
 ; GFX9-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v3, s[0:1]
+; GFX9-NEXT:    s_xor_b64 s[0:1], s[14:15], s[12:13]
 ; GFX9-NEXT:    s_ashr_i32 s4, s9, 31
-; GFX9-NEXT:    s_add_u32 s0, s8, s4
+; GFX9-NEXT:    s_add_u32 s8, s8, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX9-NEXT:    s_mov_b32 s5, s4
-; GFX9-NEXT:    s_addc_u32 s1, s9, s4
-; GFX9-NEXT:    s_xor_b64 s[8:9], s[0:1], s[4:5]
+; GFX9-NEXT:    s_addc_u32 s9, s9, s4
+; GFX9-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
 ; GFX9-NEXT:    v_subb_co_u32_e32 v4, vcc, v8, v4, vcc
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v8, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v9, s9
@@ -12565,44 +12524,42 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v5, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX9-NEXT:    v_xor_b32_e32 v2, s12, v2
-; GFX9-NEXT:    v_xor_b32_e32 v3, s13, v3
+; GFX9-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX9-NEXT:    v_xor_b32_e32 v3, s1, v3
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v9, v7, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v7, v6
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v5, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s10, v6
-; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v4
-; GFX9-NEXT:    v_mul_lo_u32 v10, s11, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, s10, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
-; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v11
-; GFX9-NEXT:    v_mul_lo_u32 v11, v6, v11
-; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s10, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v4
+; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v4
 ; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v8, s[0:1]
-; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v7, vcc
+; GFX9-NEXT:    s_add_u32 s6, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    s_mov_b32 s11, s10
-; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX9-NEXT:    s_addc_u32 s7, s7, s10
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX9-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s6, v5
@@ -12612,7 +12569,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v9, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s7, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s7, v4
-; GFX9-NEXT:    v_mov_b32_e32 v8, s13
+; GFX9-NEXT:    v_mov_b32_e32 v8, s1
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v10, v0, vcc
@@ -12621,7 +12578,7 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s9, v4
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s12, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s0, v2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v8, vcc
 ; GFX9-NEXT:    v_add_u32_e32 v3, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
@@ -12678,216 +12635,212 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s3, s3, s10
 ; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s13
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX90A-NEXT:    s_sub_u32 s14, 0, s12
-; GFX90A-NEXT:    s_subb_u32 s15, 0, s13
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s16, v2
-; GFX90A-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s18, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v2
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s12
+; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
+; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
+; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s14, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s15, v1
-; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s14, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    s_mov_b32 s15, s14
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v2, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s14, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s14, v1
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s15, v1
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s14, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v1, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v1, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
-; GFX90A-NEXT:    s_mov_b32 s15, s14
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s12, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s12, v1
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s13, v1
-; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v1
-; GFX90A-NEXT:    v_sub_u32_e32 v4, s5, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s12, v0
+; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s13
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v5, vcc, s4, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
+; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v7, vcc
 ; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s12, v5
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
-; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v8, v7, s[0:1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, 1, 2, s[0:1]
+; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v8, v7, s[0:1]
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s5
-; GFX90A-NEXT:    v_add_co_u32_e64 v4, s[0:1], v1, v4
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v8, v3, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v2, s[0:1]
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v3
+; GFX90A-NEXT:    v_add_co_u32_e64 v3, s[0:1], v0, v3
+; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v8, v2, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e64 v7, s[0:1], 0, v1, s[0:1]
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v5
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[14:15], s[10:11]
 ; GFX90A-NEXT:    s_ashr_i32 s4, s9, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v3
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
 ; GFX90A-NEXT:    s_add_u32 s8, s8, s4
-; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v8, v5, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v8, v5, vcc
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s9, s9, s4
-; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GFX90A-NEXT:    s_xor_b64 s[8:9], s[8:9], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s8
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s9
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v2
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v4
-; GFX90A-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s1
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v4
-; GFX90A-NEXT:    v_mul_f32_e32 v4, s18, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v4
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX90A-NEXT:    s_sub_u32 s10, 0, s8
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT:    s_subb_u32 s11, 0, s9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s10, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s11, v1
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s9
+; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
+; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
+; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
+; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
+; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s0, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s1, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s10, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s0, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v4, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v4, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v5
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s10, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s10, v1
-; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s11, v1
-; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s10, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v1, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v1, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
-; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v7
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s0, v2
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s1, v2
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
 ; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v7, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v4
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v7, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v4
-; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s8, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v6, s8, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s9, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s8, v2
 ; GFX90A-NEXT:    v_sub_u32_e32 v6, s7, v5
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, s9
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v7, vcc, s6, v7
@@ -12909,19 +12862,19 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, 1, 2, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v5
-; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v1, v6
+; GFX90A-NEXT:    v_add_co_u32_e64 v6, s[0:1], v2, v6
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v4, s[0:1]
+; GFX90A-NEXT:    v_addc_co_u32_e64 v8, s[0:1], 0, v3, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[0:1], s[10:11], s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v8, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s1, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s1
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v4, vcc, s0, v1
-; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v0, v[2:5], s[2:3]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v8, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX90A-NEXT:    v_xor_b32_e32 v3, s1, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s1
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
@@ -12940,7 +12893,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX6-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s2, 0xffed2705
+; GFX6-NEXT:    s_mov_b32 s4, 0xffed2705
 ; GFX6-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -12949,14 +12902,14 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s2
-; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s2
-; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GFX6-NEXT:    s_mov_b32 s6, -1
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX6-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, s8
+; GFX6-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX6-NEXT:    s_add_u32 s2, s2, s8
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GFX6-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -12973,62 +12926,60 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v4, v2, s2
-; GFX6-NEXT:    v_mul_hi_u32 v5, v0, s2
-; GFX6-NEXT:    s_mov_b32 s5, s9
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GFX6-NEXT:    v_mul_lo_u32 v5, v0, s2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, v8, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v6, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    s_ashr_i32 s2, s11, 31
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_mov_b32 s3, s2
-; GFX6-NEXT:    s_addc_u32 s1, s11, s2
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_xor_b64 s[0:1], s[0:1], s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s0, v0
-; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v1
-; GFX6-NEXT:    v_mul_hi_u32 v5, s1, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s1, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX6-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX6-NEXT:    s_mov_b32 s9, s8
+; GFX6-NEXT:    s_addc_u32 s3, s3, s8
+; GFX6-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, v0, s4
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v6, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v9, v5, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v4, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, s1, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s1, v0
-; GFX6-NEXT:    s_mov_b32 s3, 0x12d8fb
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GFX6-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s3, v1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
+; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fb
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v8, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s3
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s3
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, s0
+; GFX6-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, s0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s1
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
+; GFX6-NEXT:    v_mov_b32_e32 v2, s3
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s3, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s0, v0
 ; GFX6-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s3, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s0, v2
 ; GFX6-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v3, vcc
 ; GFX6-NEXT:    s_mov_b32 s0, 0x12d8fa
 ; GFX6-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v2
@@ -13045,10 +12996,10 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; GFX6-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s2
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s8
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -13058,7 +13009,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX9-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s8, 0xffed2705
+; GFX9-NEXT:    s_mov_b32 s4, 0xffed2705
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -13067,13 +13018,10 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s8
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX9-NEXT:    s_mov_b32 s1, s0
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
 ; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -13090,34 +13038,35 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v1, v3, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v4, v2, s8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v0, s8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v0, s8
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_add_u32_e32 v4, v6, v4
-; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v4
-; GFX9-NEXT:    v_mul_hi_u32 v9, v2, v8
-; GFX9-NEXT:    v_mul_lo_u32 v8, v2, v8
-; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v7, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v4, s[2:3]
-; GFX9-NEXT:    s_add_u32 s2, s6, s0
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
-; GFX9-NEXT:    s_addc_u32 s3, s7, s0
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, s4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v0, s4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX9-NEXT:    s_add_u32 s2, s2, s4
+; GFX9-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX9-NEXT:    v_mul_hi_u32 v6, v1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v4, v1, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v7, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX9-NEXT:    s_mov_b32 s5, s4
+; GFX9-NEXT:    s_addc_u32 s3, s3, s4
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_mul_lo_u32 v2, s2, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v1
@@ -13127,44 +13076,44 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v7, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s3, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s3, v0
-; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fb
+; GFX9-NEXT:    s_mov_b32 s5, 0x12d8fb
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s1
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s1
-; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s1
+; GFX9-NEXT:    v_mul_lo_u32 v1, v1, s5
+; GFX9-NEXT:    v_mul_hi_u32 v2, v0, s5
+; GFX9-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX9-NEXT:    v_add_u32_e32 v1, v2, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s3
 ; GFX9-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s1, v0
+; GFX9-NEXT:    v_subrev_co_u32_e32 v2, vcc, s5, v0
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s1, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v4, vcc, s5, v2
 ; GFX9-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v3, vcc
-; GFX9-NEXT:    s_mov_b32 s1, 0x12d8fa
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v2
+; GFX9-NEXT:    s_mov_b32 s2, 0x12d8fa
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
+; GFX9-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v0
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, -1, v4, vcc
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX9-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s0
-; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX9-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s4
+; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
-; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[4:5]
+; GFX9-NEXT:    global_store_dwordx2 v5, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
 ; GFX90A-LABEL: srem_i64_oddk_denom:
@@ -13172,7 +13121,7 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GFX90A-NEXT:    v_madak_f32 v0, 0, v0, 0x4996c7d8
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX90A-NEXT:    s_mov_b32 s2, 0xffed2705
+; GFX90A-NEXT:    s_mov_b32 s4, 0xffed2705
 ; GFX90A-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -13181,12 +13130,12 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s2
+; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v0, s4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
 ; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, v3
@@ -13201,37 +13150,35 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v0, s2
-; GFX90A-NEXT:    v_add_u32_e32 v5, v6, v5
-; GFX90A-NEXT:    v_sub_u32_e32 v5, v5, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v7, v0, s2
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v7
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, v8, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v6, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v7, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v6, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v0, s4
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_sub_u32_e32 v3, v3, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, s4
+; GFX90A-NEXT:    v_mul_hi_u32 v6, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v8, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v6, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    s_ashr_i32 s0, s7, 31
-; GFX90A-NEXT:    s_add_u32 s2, s6, s0
+; GFX90A-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v8, v4, vcc
+; GFX90A-NEXT:    s_add_u32 s2, s2, s4
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX90A-NEXT:    s_mov_b32 s1, s0
-; GFX90A-NEXT:    s_addc_u32 s3, s7, s0
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
-; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX90A-NEXT:    s_mov_b32 s5, s4
+; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s2, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
 ; GFX90A-NEXT:    v_mul_hi_u32 v3, s2, v1
@@ -13246,39 +13193,39 @@ define amdgpu_kernel void @srem_i64_oddk_denom(i64 addrspace(1)* %out, i64 %x) {
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v8, v3, vcc
-; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fb
-; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s1
-; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s1
+; GFX90A-NEXT:    s_mov_b32 s5, 0x12d8fb
+; GFX90A-NEXT:    v_mul_lo_u32 v1, v1, s5
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, s5
+; GFX90A-NEXT:    v_mul_lo_u32 v0, v0, s5
 ; GFX90A-NEXT:    v_add_u32_e32 v1, v3, v1
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v3, vcc, s1, v0
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v3, vcc, s5, v0
 ; GFX90A-NEXT:    v_subbrev_co_u32_e32 v4, vcc, 0, v1, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s1, v3
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v5, vcc, s5, v3
 ; GFX90A-NEXT:    v_subbrev_co_u32_e32 v6, vcc, 0, v4, vcc
-; GFX90A-NEXT:    s_mov_b32 s1, 0x12d8fa
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v3
+; GFX90A-NEXT:    s_mov_b32 s2, 0x12d8fa
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v7, -1, v7, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s1, v0
+; GFX90A-NEXT:    v_cmp_lt_u32_e32 vcc, s2, v0
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v0, s0, v0
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v3, s0
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX90A-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s4
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s4, v0
 ; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
-; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX90A-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX90A-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, i64 addrspace(1)* %out
@@ -13358,7 +13305,6 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-LABEL: srem_i64_pow2_shl_denom:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_load_dword s4, s[0:1], 0xd
-; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
 ; GFX6-NEXT:    s_mov_b64 s[2:3], 0x1000
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX6-NEXT:    s_mov_b32 s6, -1
@@ -13368,27 +13314,29 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    s_add_u32 s2, s2, s4
 ; GFX6-NEXT:    s_mov_b32 s5, s4
 ; GFX6-NEXT:    s_addc_u32 s3, s3, s4
-; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GFX6-NEXT:    s_sub_u32 s2, 0, s12
-; GFX6-NEXT:    s_subb_u32 s3, 0, s13
-; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
+; GFX6-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GFX6-NEXT:    s_sub_u32 s4, 0, s8
+; GFX6-NEXT:    s_subb_u32 s5, 0, s9
+; GFX6-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s15, s14
-; GFX6-NEXT:    s_mov_b32 s4, s8
-; GFX6-NEXT:    s_mov_b32 s5, s9
+; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX6-NEXT:    s_ashr_i32 s10, s3, 31
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v5, s3, v0
-; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v0
+; GFX6-NEXT:    s_add_u32 s2, s2, s10
+; GFX6-NEXT:    s_mov_b32 s11, s10
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GFX6-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GFX6-NEXT:    s_addc_u32 s3, s3, s10
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v0, v4
@@ -13400,6 +13348,7 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v1, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX6-NEXT:    s_xor_b64 s[12:13], s[2:3], s[10:11]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
 ; GFX6-NEXT:    v_mov_b32_e32 v4, 0
@@ -13407,88 +13356,85 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX6-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX6-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GFX6-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, v2, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v2, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v2, v2, v5
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v8, v4, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v5, vcc
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_addc_u32_e64 v1, vcc, v1, v5, s[0:1]
-; GFX6-NEXT:    s_add_u32 s0, s10, s14
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GFX6-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GFX6-NEXT:    v_mul_hi_u32 v5, s10, v1
-; GFX6-NEXT:    v_mul_hi_u32 v7, s11, v1
-; GFX6-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s4, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s4, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GFX6-NEXT:    s_mov_b32 s5, s1
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_mul_lo_u32 v3, s4, v0
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; GFX6-NEXT:    v_mul_lo_u32 v8, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX6-NEXT:    v_mul_hi_u32 v10, v0, v2
+; GFX6-NEXT:    v_mul_hi_u32 v7, v1, v3
+; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v5, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v8, v3
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v4, vcc
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v6, v5, vcc
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v2, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v3, s12, v0
+; GFX6-NEXT:    v_mul_hi_u32 v5, s12, v1
+; GFX6-NEXT:    v_mul_hi_u32 v7, s13, v1
+; GFX6-NEXT:    v_mul_lo_u32 v1, s13, v1
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GFX6-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GFX6-NEXT:    v_mul_lo_u32 v5, s13, v0
+; GFX6-NEXT:    v_mul_hi_u32 v0, s13, v0
+; GFX6-NEXT:    s_mov_b32 s4, s0
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v7, v4, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v6, v2, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, s12, v0
-; GFX6-NEXT:    v_mul_lo_u32 v3, s13, v0
-; GFX6-NEXT:    v_mul_lo_u32 v0, s12, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, s8, v1
+; GFX6-NEXT:    v_mul_hi_u32 v2, s8, v0
+; GFX6-NEXT:    v_mul_lo_u32 v3, s9, v0
+; GFX6-NEXT:    v_mul_lo_u32 v0, s8, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s11, v1
-; GFX6-NEXT:    v_mov_b32_e32 v3, s13
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s10, v0
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s13, v1
+; GFX6-NEXT:    v_mov_b32_e32 v3, s9
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s12, v0
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, vcc
-; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s12, v0
+; GFX6-NEXT:    v_subrev_i32_e64 v4, s[0:1], s8, v0
 ; GFX6-NEXT:    v_subbrev_u32_e64 v5, s[2:3], 0, v2, s[0:1]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v5
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s9, v5
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v4
+; GFX6-NEXT:    v_cmp_le_u32_e64 s[2:3], s8, v4
 ; GFX6-NEXT:    v_subb_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
-; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v5
-; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s12, v4
+; GFX6-NEXT:    v_cmp_eq_u32_e64 s[2:3], s9, v5
+; GFX6-NEXT:    v_subrev_i32_e64 v3, s[0:1], s8, v4
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
 ; GFX6-NEXT:    v_subbrev_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX6-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v5, v2, s[0:1]
-; GFX6-NEXT:    v_mov_b32_e32 v5, s11
+; GFX6-NEXT:    v_mov_b32_e32 v5, s13
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
 ; GFX6-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
+; GFX6-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
 ; GFX6-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX6-NEXT:    v_cndmask_b32_e64 v2, v4, v3, s[0:1]
 ; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_xor_b32_e32 v0, s14, v0
-; GFX6-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX6-NEXT:    v_mov_b32_e32 v2, s14
-; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s14, v0
+; GFX6-NEXT:    v_xor_b32_e32 v0, s10, v0
+; GFX6-NEXT:    v_xor_b32_e32 v1, s10, v1
+; GFX6-NEXT:    v_mov_b32_e32 v2, s10
+; GFX6-NEXT:    v_subrev_i32_e32 v0, vcc, s10, v0
 ; GFX6-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GFX6-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX6-NEXT:    s_endpgm
@@ -13507,20 +13453,26 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX9-NEXT:    s_sub_u32 s10, 0, s8
-; GFX9-NEXT:    s_subb_u32 s4, 0, s9
+; GFX9-NEXT:    s_sub_u32 s2, 0, s8
+; GFX9-NEXT:    s_subb_u32 s3, 0, s9
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v3, s10, v1
-; GFX9-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v6, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v0
+; GFX9-NEXT:    s_add_u32 s0, s6, s10
+; GFX9-NEXT:    s_mov_b32 s11, s10
+; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s2, v0
+; GFX9-NEXT:    s_addc_u32 s1, s7, s10
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v0, v5
@@ -13538,39 +13490,31 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v0, s[2:3], v0, v3
-; GFX9-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX9-NEXT:    v_mul_hi_u32 v7, s10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v0
-; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v0
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX9-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v11, v0, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GFX9-NEXT:    v_mul_hi_u32 v8, v3, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v3, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v2, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v5, vcc
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[2:3]
-; GFX9-NEXT:    s_add_u32 s0, s6, s10
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
-; GFX9-NEXT:    s_mov_b32 s11, s10
-; GFX9-NEXT:    s_addc_u32 s1, s7, s10
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, s2, v1
+; GFX9-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v0
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
+; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX9-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v9, v0, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, v0, v3
+; GFX9-NEXT:    v_mul_hi_u32 v5, v1, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v1, v7
+; GFX9-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v3, s6, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v4, s6, v0
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s6, v1
@@ -13642,25 +13586,25 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GFX90A-NEXT:    s_sub_u32 s2, 0, s8
-; GFX90A-NEXT:    s_subb_u32 s3, 0, s9
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s8
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s9
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
 ; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX90A-NEXT:    s_mov_b32 s11, s10
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
 ; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
@@ -13677,34 +13621,32 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(i64 addrspace(1)* %out, i64 %
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
 ; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v0, s[0:1], v0, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v1, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v0
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v0, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v0, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v0, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v2, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v1, vcc, v1, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v4, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v2, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v4, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
 ; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v3
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
 ; GFX90A-NEXT:    v_mul_lo_u32 v4, s6, v1
 ; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v0
@@ -13900,12 +13842,14 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v0, s16
 ; GFX6-NEXT:    v_cvt_f32_u32_e32 v1, s17
 ; GFX6-NEXT:    s_mov_b32 s21, 0xcf800000
-; GFX6-NEXT:    s_sub_u32 s6, 0, s16
-; GFX6-NEXT:    s_subb_u32 s7, 0, s17
+; GFX6-NEXT:    s_sub_u32 s2, 0, s16
+; GFX6-NEXT:    s_subb_u32 s3, 0, s17
 ; GFX6-NEXT:    v_mac_f32_e32 v0, s18, v1
 ; GFX6-NEXT:    v_rcp_f32_e32 v0, v0
 ; GFX6-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; GFX6-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
+; GFX6-NEXT:    s_mov_b32 s7, 0xf000
+; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_mul_f32_e32 v0, s19, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, s20, v0
 ; GFX6-NEXT:    v_trunc_f32_e32 v1, v1
@@ -13915,10 +13859,10 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_ashr_i32 s12, s9, 31
 ; GFX6-NEXT:    s_add_u32 s0, s8, s12
-; GFX6-NEXT:    v_mul_lo_u32 v0, s6, v2
-; GFX6-NEXT:    v_mul_hi_u32 v1, s6, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, s7, v3
-; GFX6-NEXT:    v_mul_lo_u32 v5, s6, v3
+; GFX6-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v1, s2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, s3, v3
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v3
 ; GFX6-NEXT:    s_mov_b32 s13, s12
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v0, v4
@@ -13940,33 +13884,30 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e64 v3, s[2:3], v3, v4
-; GFX6-NEXT:    v_addc_u32_e64 v4, vcc, v2, v5, s[2:3]
-; GFX6-NEXT:    v_mul_lo_u32 v6, s6, v4
-; GFX6-NEXT:    v_mul_hi_u32 v7, s6, v3
-; GFX6-NEXT:    v_mul_lo_u32 v8, s7, v3
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GFX6-NEXT:    v_mul_lo_u32 v7, s6, v3
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GFX6-NEXT:    v_mul_lo_u32 v10, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v11, v3, v7
-; GFX6-NEXT:    v_mul_hi_u32 v12, v3, v6
-; GFX6-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GFX6-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v8, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v1, v6, vcc
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; GFX6-NEXT:    v_addc_u32_e64 v2, vcc, v2, v6, s[2:3]
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GFX6-NEXT:    v_mul_hi_u32 v5, s2, v3
+; GFX6-NEXT:    v_mul_lo_u32 v6, s3, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_mul_lo_u32 v5, s2, v3
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v9, v3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GFX6-NEXT:    v_mul_hi_u32 v7, v2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX6-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, 0, v10, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v6, vcc, v6, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v1, v6, vcc
+; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GFX6-NEXT:    v_addc_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v4, s8, v2
 ; GFX6-NEXT:    v_mul_hi_u32 v5, s8, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v6, s8, v2
@@ -13976,7 +13917,6 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s9, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v3, s9, v3
-; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_addc_u32_e32 v3, vcc, v5, v3, vcc
 ; GFX6-NEXT:    v_addc_u32_e32 v4, vcc, v7, v0, vcc
@@ -14031,15 +13971,15 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_mac_f32_e32 v5, s21, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX6-NEXT:    s_sub_u32 s2, 0, s8
+; GFX6-NEXT:    s_sub_u32 s0, 0, s8
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
-; GFX6-NEXT:    v_mul_hi_u32 v4, s2, v5
-; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v6
-; GFX6-NEXT:    s_subb_u32 s3, 0, s9
-; GFX6-NEXT:    v_mul_lo_u32 v8, s3, v5
+; GFX6-NEXT:    v_mul_hi_u32 v4, s0, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v6
+; GFX6-NEXT:    s_subb_u32 s1, 0, s9
+; GFX6-NEXT:    v_mul_lo_u32 v8, s1, v5
 ; GFX6-NEXT:    s_ashr_i32 s14, s11, 31
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GFX6-NEXT:    v_mul_lo_u32 v7, s2, v5
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v5
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v8, v5, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
@@ -14057,35 +13997,33 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
 ; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e64 v4, s[0:1], v5, v4
-; GFX6-NEXT:    v_addc_u32_e64 v5, vcc, v6, v7, s[0:1]
-; GFX6-NEXT:    v_mul_lo_u32 v8, s2, v5
-; GFX6-NEXT:    v_mul_hi_u32 v9, s2, v4
-; GFX6-NEXT:    v_mul_lo_u32 v10, s3, v4
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v6, v7, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX6-NEXT:    v_mul_hi_u32 v7, s0, v4
+; GFX6-NEXT:    v_mul_lo_u32 v8, s1, v4
 ; GFX6-NEXT:    v_xor_b32_e32 v3, s12, v3
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GFX6-NEXT:    v_mul_lo_u32 v9, s2, v4
-; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GFX6-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GFX6-NEXT:    v_mul_hi_u32 v13, v4, v9
-; GFX6-NEXT:    v_mul_hi_u32 v14, v4, v8
-; GFX6-NEXT:    v_mul_hi_u32 v11, v5, v9
-; GFX6-NEXT:    v_mul_lo_u32 v9, v5, v9
-; GFX6-NEXT:    v_mul_hi_u32 v10, v5, v8
-; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GFX6-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; GFX6-NEXT:    v_addc_u32_e32 v9, vcc, v13, v11, vcc
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v10, v0, vcc
-; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v1, v8, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GFX6-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[0:1]
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v7, s0, v4
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
+; GFX6-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v11, v4, v7
+; GFX6-NEXT:    v_mul_hi_u32 v12, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v9, v5, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GFX6-NEXT:    v_mul_hi_u32 v8, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GFX6-NEXT:    v_addc_u32_e32 v11, vcc, 0, v12, vcc
+; GFX6-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v8, vcc, v8, v0, vcc
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GFX6-NEXT:    v_addc_u32_e32 v7, vcc, v1, v8, vcc
 ; GFX6-NEXT:    s_add_u32 s0, s10, s14
-; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    s_addc_u32 s1, s11, s14
-; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, 0, v6, vcc
+; GFX6-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX6-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
 ; GFX6-NEXT:    v_mul_lo_u32 v6, s10, v5
 ; GFX6-NEXT:    v_mul_hi_u32 v7, s10, v4
@@ -14165,20 +14103,24 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GFX9-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX9-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX9-NEXT:    s_sub_u32 s8, 0, s12
-; GFX9-NEXT:    s_subb_u32 s4, 0, s13
+; GFX9-NEXT:    s_sub_u32 s2, 0, s12
+; GFX9-NEXT:    s_subb_u32 s3, 0, s13
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s16, v1
 ; GFX9-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX9-NEXT:    v_mul_f32_e32 v0, s17, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, s18, v0
 ; GFX9-NEXT:    v_trunc_f32_e32 v1, v1
 ; GFX9-NEXT:    v_mac_f32_e32 v0, s19, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v0
-; GFX9-NEXT:    v_mul_lo_u32 v0, s8, v2
-; GFX9-NEXT:    v_mul_hi_u32 v1, s8, v3
-; GFX9-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, s8, v3
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
+; GFX9-NEXT:    s_mov_b32 s9, s8
+; GFX9-NEXT:    v_mul_lo_u32 v0, s2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v1, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v5, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    v_add_u32_e32 v5, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v3, v4
@@ -14197,38 +14139,32 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
 ; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[2:3], v3, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v4, vcc, v2, v5, s[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v6, s8, v4
-; GFX9-NEXT:    v_mul_hi_u32 v7, s8, v3
-; GFX9-NEXT:    v_mul_lo_u32 v8, s4, v3
-; GFX9-NEXT:    v_mul_lo_u32 v9, s8, v3
-; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
-; GFX9-NEXT:    v_mul_lo_u32 v10, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v11, v3, v9
-; GFX9-NEXT:    v_mul_hi_u32 v12, v3, v6
-; GFX9-NEXT:    v_mul_hi_u32 v8, v4, v9
-; GFX9-NEXT:    v_mul_lo_u32 v9, v4, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v7, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v6, vcc
-; GFX9-NEXT:    v_add_u32_e32 v2, v2, v5
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_ashr_i32 s8, s5, 31
-; GFX9-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v6, s[2:3]
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v2
+; GFX9-NEXT:    v_mul_hi_u32 v5, s2, v3
+; GFX9-NEXT:    v_mul_lo_u32 v6, s3, v3
+; GFX9-NEXT:    v_mul_lo_u32 v7, s2, v3
 ; GFX9-NEXT:    s_add_u32 s2, s4, s8
+; GFX9-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_u32_e32 v4, v4, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v3, v7
+; GFX9-NEXT:    v_mul_hi_u32 v10, v3, v4
+; GFX9-NEXT:    v_mul_hi_u32 v6, v2, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v2, v7
+; GFX9-NEXT:    v_mul_hi_u32 v5, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v9, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v1, v5, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
-; GFX9-NEXT:    s_mov_b32 s9, s8
 ; GFX9-NEXT:    s_addc_u32 s3, s5, s8
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v5, vcc
 ; GFX9-NEXT:    s_xor_b64 s[14:15], s[2:3], s[8:9]
 ; GFX9-NEXT:    v_mul_lo_u32 v4, s14, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s14, v3
@@ -14294,13 +14230,13 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_mac_f32_e32 v5, s19, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v5, v5
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX9-NEXT:    s_sub_u32 s2, 0, s10
-; GFX9-NEXT:    s_subb_u32 s3, 0, s11
-; GFX9-NEXT:    v_mul_hi_u32 v7, s2, v5
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v6
-; GFX9-NEXT:    v_mul_lo_u32 v9, s3, v5
+; GFX9-NEXT:    s_sub_u32 s0, 0, s10
+; GFX9-NEXT:    s_subb_u32 s1, 0, s11
+; GFX9-NEXT:    v_mul_hi_u32 v7, s0, v5
+; GFX9-NEXT:    v_mul_lo_u32 v8, s0, v6
+; GFX9-NEXT:    v_mul_lo_u32 v9, s1, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v4, s2, v5
+; GFX9-NEXT:    v_mul_lo_u32 v4, s0, v5
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v5, v7
@@ -14319,34 +14255,32 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v0, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[0:1], v5, v4
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v6, v7, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v8, s2, v5
-; GFX9-NEXT:    v_mul_hi_u32 v9, s2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v10, s3, v4
-; GFX9-NEXT:    v_mul_lo_u32 v11, s2, v4
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
-; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX9-NEXT:    v_add_u32_e32 v8, v8, v10
-; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v13, v4, v11
-; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v8
-; GFX9-NEXT:    v_mul_hi_u32 v10, v5, v11
-; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v11
-; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v13, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
-; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v9, v0, vcc
-; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, s0, v5
+; GFX9-NEXT:    v_mul_hi_u32 v7, s0, v4
+; GFX9-NEXT:    v_mul_lo_u32 v8, s1, v4
+; GFX9-NEXT:    v_mul_lo_u32 v9, s0, v4
 ; GFX9-NEXT:    s_add_u32 s0, s6, s12
-; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v5
+; GFX9-NEXT:    v_add_u32_e32 v6, v7, v6
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v9
+; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v9
+; GFX9-NEXT:    v_mul_lo_u32 v9, v5, v9
+; GFX9-NEXT:    v_mul_hi_u32 v7, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v12, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v8, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v8, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v1, v7, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
 ; GFX9-NEXT:    s_addc_u32 s1, s7, s12
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    s_xor_b64 s[6:7], s[0:1], s[12:13]
 ; GFX9-NEXT:    v_mul_lo_u32 v6, s6, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, s6, v4
@@ -14426,222 +14360,218 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    s_mov_b32 s5, s4
 ; GFX90A-NEXT:    s_addc_u32 s3, s3, s4
 ; GFX90A-NEXT:    s_xor_b64 s[12:13], s[2:3], s[4:5]
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s12
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s13
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v1, s13
 ; GFX90A-NEXT:    s_mov_b32 s19, 0xcf800000
-; GFX90A-NEXT:    s_sub_u32 s2, 0, s12
-; GFX90A-NEXT:    s_subb_u32 s3, 0, s13
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s16, v2
-; GFX90A-NEXT:    v_rcp_f32_e32 v1, v1
-; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX90A-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
 ; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v1
-; GFX90A-NEXT:    v_mul_f32_e32 v2, s18, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v2, v2
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v2
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s12
+; GFX90A-NEXT:    v_mac_f32_e32 v0, s16, v1
+; GFX90A-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s13
+; GFX90A-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_ashr_i32 s14, s5, 31
+; GFX90A-NEXT:    v_mul_f32_e32 v0, s17, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v1, s18, v0
+; GFX90A-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX90A-NEXT:    v_mac_f32_e32 v0, s19, v1
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX90A-NEXT:    s_mov_b32 s15, s14
-; GFX90A-NEXT:    v_mul_hi_u32 v4, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s3, v1
-; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
-; GFX90A-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX90A-NEXT:    v_mul_lo_u32 v6, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v1, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v4, v1, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v6, s0, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v0, v6
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v6
-; GFX90A-NEXT:    v_mul_lo_u32 v6, v2, v6
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v6
+; GFX90A-NEXT:    v_mul_lo_u32 v6, v1, v6
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v6
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v4, v8, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, 0
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v4, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
-; GFX90A-NEXT:    v_addc_co_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v1
-; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s3, v1
-; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v1, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v8, v1, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v1, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
-; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v8, v3
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v6, v7, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v4
-; GFX90A-NEXT:    v_addc_co_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v3, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s0, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v3, s0, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s1, v0
+; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v0, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v10
+; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v3, v1, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v9, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v5, v2
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v3, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s4, s14
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v2
 ; GFX90A-NEXT:    s_addc_u32 s1, s5, s14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[0:1], s[14:15]
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s4, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, v5, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v1
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v2, s4, v1
+; GFX90A-NEXT:    v_add_co_u32_e32 v3, vcc, v5, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s5, v0
+; GFX90A-NEXT:    v_mul_lo_u32 v0, s5, v0
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v3, v0
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v0, vcc, v2, v7, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v4, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s5, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s5, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v3, v7, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s5, v2
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
-; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v6, v3, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v2, s12, v2
-; GFX90A-NEXT:    v_mul_hi_u32 v3, s12, v1
-; GFX90A-NEXT:    v_add_u32_e32 v2, v3, v2
-; GFX90A-NEXT:    v_mul_lo_u32 v3, s13, v1
-; GFX90A-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v1
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v6, v2, vcc
 ; GFX90A-NEXT:    v_mul_lo_u32 v1, s12, v1
-; GFX90A-NEXT:    v_sub_u32_e32 v3, s5, v2
-; GFX90A-NEXT:    v_mov_b32_e32 v4, s13
-; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, s4, v1
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v1
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v3, s[0:1]
+; GFX90A-NEXT:    v_mul_hi_u32 v2, s12, v0
+; GFX90A-NEXT:    v_add_u32_e32 v1, v2, v1
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s13, v0
+; GFX90A-NEXT:    v_add_u32_e32 v1, v1, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v0, s12, v0
+; GFX90A-NEXT:    v_sub_u32_e32 v2, s5, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v3, s13
+; GFX90A-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, vcc
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v0
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s13, v7
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v5
-; GFX90A-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
+; GFX90A-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v3, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
 ; GFX90A-NEXT:    v_cmp_eq_u32_e64 s[2:3], s13, v7
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s12, v5
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v5
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
-; GFX90A-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX90A-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
-; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[0:1]
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v5, s5
-; GFX90A-NEXT:    v_subb_co_u32_e32 v2, vcc, v5, v2, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v7, v3, s[0:1]
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v1, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s13, v1
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v2, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v1
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
 ; GFX90A-NEXT:    s_ashr_i32 s0, s11, 31
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v1
 ; GFX90A-NEXT:    s_add_u32 s2, s10, s0
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
 ; GFX90A-NEXT:    s_mov_b32 s1, s0
 ; GFX90A-NEXT:    s_addc_u32 s3, s11, s0
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
 ; GFX90A-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s4
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX90A-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX90A-NEXT:    v_xor_b32_e32 v0, s14, v0
+; GFX90A-NEXT:    s_sub_u32 s0, 0, s4
 ; GFX90A-NEXT:    v_xor_b32_e32 v1, s14, v1
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s14, v2
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s14, v1
-; GFX90A-NEXT:    v_mac_f32_e32 v3, s16, v4
-; GFX90A-NEXT:    v_rcp_f32_e32 v4, v3
-; GFX90A-NEXT:    v_mov_b32_e32 v7, s14
-; GFX90A-NEXT:    s_sub_u32 s2, 0, s4
-; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v7, vcc
-; GFX90A-NEXT:    v_mul_f32_e32 v1, s17, v4
-; GFX90A-NEXT:    v_mul_f32_e32 v4, s18, v1
-; GFX90A-NEXT:    v_trunc_f32_e32 v4, v4
-; GFX90A-NEXT:    v_mac_f32_e32 v1, s19, v4
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX90A-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GFX90A-NEXT:    s_subb_u32 s3, 0, s5
-; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
-; GFX90A-NEXT:    v_mul_hi_u32 v7, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s3, v1
+; GFX90A-NEXT:    v_mac_f32_e32 v2, s16, v3
+; GFX90A-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s14
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v2, s17, v2
+; GFX90A-NEXT:    v_mul_f32_e32 v3, s18, v2
+; GFX90A-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX90A-NEXT:    v_mac_f32_e32 v2, s19, v3
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX90A-NEXT:    s_subb_u32 s1, 0, s5
+; GFX90A-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s0, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s1, v2
 ; GFX90A-NEXT:    v_add_u32_e32 v7, v7, v8
 ; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s2, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v8, v1, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v7, v1, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v9, s0, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v2, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v2, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v10, v8
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v4, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v9, v4, v9
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v3, v9
+; GFX90A-NEXT:    v_mul_lo_u32 v9, v3, v9
 ; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v4, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v10, v3, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
 ; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v7, v5
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v8, vcc
-; GFX90A-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v5
-; GFX90A-NEXT:    v_addc_co_u32_e64 v5, vcc, v4, v7, s[0:1]
-; GFX90A-NEXT:    v_mul_lo_u32 v8, s2, v5
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s2, v1
-; GFX90A-NEXT:    v_add_u32_e32 v8, v9, v8
-; GFX90A-NEXT:    v_mul_lo_u32 v9, s3, v1
-; GFX90A-NEXT:    v_add_u32_e32 v8, v8, v9
-; GFX90A-NEXT:    v_mul_lo_u32 v10, s2, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v11, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v12, v5, v10
-; GFX90A-NEXT:    v_mul_lo_u32 v14, v1, v8
-; GFX90A-NEXT:    v_mul_hi_u32 v10, v1, v10
-; GFX90A-NEXT:    v_mul_hi_u32 v13, v1, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v14
-; GFX90A-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
-; GFX90A-NEXT:    v_add_co_u32_e32 v10, vcc, v10, v12
-; GFX90A-NEXT:    v_mul_hi_u32 v9, v5, v8
-; GFX90A-NEXT:    v_addc_co_u32_e32 v10, vcc, v13, v11, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v5, v5, v8
-; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v10, v5
-; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v6, v9, vcc
-; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v7
-; GFX90A-NEXT:    v_addc_co_u32_e64 v4, vcc, v4, v8, s[0:1]
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s0, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v7, s0, v2
+; GFX90A-NEXT:    v_add_u32_e32 v5, v7, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s1, v2
+; GFX90A-NEXT:    v_add_u32_e32 v5, v5, v7
+; GFX90A-NEXT:    v_mul_lo_u32 v8, s0, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v9, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v10, v3, v8
+; GFX90A-NEXT:    v_mul_lo_u32 v12, v2, v5
+; GFX90A-NEXT:    v_mul_hi_u32 v8, v2, v8
+; GFX90A-NEXT:    v_mul_hi_u32 v11, v2, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v12
+; GFX90A-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v10
+; GFX90A-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GFX90A-NEXT:    v_addc_co_u32_e32 v8, vcc, v11, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
+; GFX90A-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX90A-NEXT:    v_addc_co_u32_e32 v7, vcc, v6, v7, vcc
 ; GFX90A-NEXT:    s_add_u32 s0, s6, s10
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v5
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v5
 ; GFX90A-NEXT:    s_mov_b32 s11, s10
 ; GFX90A-NEXT:    s_addc_u32 s1, s7, s10
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v7, vcc
 ; GFX90A-NEXT:    s_xor_b64 s[6:7], s[0:1], s[10:11]
-; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v4
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v4
+; GFX90A-NEXT:    v_mul_lo_u32 v7, s6, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s6, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s6, v3
 ; GFX90A-NEXT:    v_add_co_u32_e32 v7, vcc, v8, v7
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v1
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s7, v1
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v7, v1
-; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, v5, v9, vcc
-; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v0, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s7, v4
-; GFX90A-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
-; GFX90A-NEXT:    v_addc_co_u32_e32 v4, vcc, v6, v5, vcc
-; GFX90A-NEXT:    v_mul_lo_u32 v4, s4, v4
-; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v1
-; GFX90A-NEXT:    v_add_u32_e32 v4, v5, v4
-; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v1
-; GFX90A-NEXT:    v_add_u32_e32 v4, v4, v5
-; GFX90A-NEXT:    v_mul_lo_u32 v1, s4, v1
-; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v4
+; GFX90A-NEXT:    v_mul_hi_u32 v9, s7, v2
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s7, v2
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v7, v2
+; GFX90A-NEXT:    v_mul_hi_u32 v8, s7, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v9, vcc
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, v8, v4, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s7, v3
+; GFX90A-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v3
+; GFX90A-NEXT:    v_addc_co_u32_e32 v3, vcc, v6, v5, vcc
+; GFX90A-NEXT:    v_mul_lo_u32 v3, s4, v3
+; GFX90A-NEXT:    v_mul_hi_u32 v5, s4, v2
+; GFX90A-NEXT:    v_add_u32_e32 v3, v5, v3
+; GFX90A-NEXT:    v_mul_lo_u32 v5, s5, v2
+; GFX90A-NEXT:    v_add_u32_e32 v3, v3, v5
+; GFX90A-NEXT:    v_mul_lo_u32 v2, s4, v2
+; GFX90A-NEXT:    v_sub_u32_e32 v5, s7, v3
 ; GFX90A-NEXT:    v_mov_b32_e32 v6, s5
-; GFX90A-NEXT:    v_sub_co_u32_e32 v1, vcc, s6, v1
+; GFX90A-NEXT:    v_sub_co_u32_e32 v2, vcc, s6, v2
 ; GFX90A-NEXT:    v_subb_co_u32_e64 v5, s[0:1], v5, v6, vcc
-; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v1
+; GFX90A-NEXT:    v_subrev_co_u32_e64 v7, s[0:1], s4, v2
 ; GFX90A-NEXT:    v_subbrev_co_u32_e64 v8, s[2:3], 0, v5, s[0:1]
 ; GFX90A-NEXT:    v_cmp_le_u32_e64 s[2:3], s5, v8
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
@@ -14655,23 +14585,23 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(<2 x i64> addrspace(1)* %ou
 ; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v6, v7, v6, s[0:1]
 ; GFX90A-NEXT:    v_mov_b32_e32 v7, s7
-; GFX90A-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v4, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v4
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v7, v3, vcc
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
+; GFX90A-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v4
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, s5, v3
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v7, v7, v8, vcc
 ; GFX90A-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GFX90A-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX90A-NEXT:    v_xor_b32_e32 v1, s10, v1
-; GFX90A-NEXT:    v_xor_b32_e32 v5, s10, v4
-; GFX90A-NEXT:    v_mov_b32_e32 v6, s10
-; GFX90A-NEXT:    v_subrev_co_u32_e32 v4, vcc, s10, v1
-; GFX90A-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v6, vcc
-; GFX90A-NEXT:    global_store_dwordx4 v0, v[2:5], s[8:9]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX90A-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX90A-NEXT:    v_xor_b32_e32 v3, s10, v3
+; GFX90A-NEXT:    v_mov_b32_e32 v5, s10
+; GFX90A-NEXT:    v_subrev_co_u32_e32 v2, vcc, s10, v2
+; GFX90A-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX90A-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
 ; GFX90A-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y

diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 880dcbe8296f..51c0b76b91ec 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -54,31 +54,29 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
-; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
-; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GFX9-NEXT:    v_mul_hi_u32 v16, v5, v7
-; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v12, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v15, v13, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v11, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v12, v10, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v8, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
@@ -217,31 +215,29 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
+; GFX9-NEXT:    v_add3_u32 v7, v9, v8, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v15, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v14, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v13, v15, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v13, v14, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v14, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v11, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -375,31 +371,29 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v13, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v14, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
-; GFX9-NEXT:    v_mul_lo_u32 v12, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v15, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v16, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GFX9-NEXT:    v_mul_hi_u32 v10, v8, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v15, v12
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v14, v16, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v12, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v10, v13, vcc
+; GFX9-NEXT:    v_add3_u32 v7, v9, v8, v7
+; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v6
+; GFX9-NEXT:    v_mul_hi_u32 v12, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v14, v12, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v11, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v13, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v14, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v6
@@ -534,31 +528,29 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
+; GFX9-NEXT:    v_add3_u32 v7, v9, v8, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v15, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v14, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v13, v15, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v13, v14, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v14, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v11, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -815,31 +807,29 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v16, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v9, vcc, v10, v9
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[4:5], v5, v9
-; GFX9-NEXT:    v_addc_co_u32_e64 v9, vcc, v6, v10, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v11, v7, v9
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v10, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v8, v5
-; GFX9-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GFX9-NEXT:    v_mul_hi_u32 v10, v7, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v5
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v10
-; GFX9-NEXT:    v_add3_u32 v8, v12, v11, v8
-; GFX9-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GFX9-NEXT:    v_mul_hi_u32 v16, v5, v7
-; GFX9-NEXT:    v_mul_hi_u32 v17, v5, v8
-; GFX9-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GFX9-NEXT:    v_mul_hi_u32 v11, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v13, vcc, v16, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v16, vcc, v15, v17, vcc
-; GFX9-NEXT:    v_mul_lo_u32 v8, v9, v8
-; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v13, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v16, v12, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v11, v14, vcc
+; GFX9-NEXT:    v_add3_u32 v8, v10, v9, v8
+; GFX9-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v12, v5, v7
+; GFX9-NEXT:    v_mul_hi_u32 v13, v5, v8
+; GFX9-NEXT:    v_mul_hi_u32 v10, v6, v7
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v12, v11
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v15, v13, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v11, v7
+; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v12, v10, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v14, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v15, v9, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, vcc, v6, v8, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v7
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v8, vcc
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v7
 ; GFX9-NEXT:    v_xor_b32_e32 v0, v0, v7
@@ -998,31 +988,29 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v9, v8
 ; GFX9-NEXT:    v_addc_co_u32_e32 v9, vcc, v13, v10, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v4, s[4:5], v4, v8
-; GFX9-NEXT:    v_addc_co_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GFX9-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v8
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v4
-; GFX9-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GFX9-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, v4
-; GFX9-NEXT:    v_add_u32_e32 v5, v5, v9
-; GFX9-NEXT:    v_add3_u32 v7, v11, v10, v7
+; GFX9-NEXT:    v_add3_u32 v7, v9, v8, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v10, v4, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v4, v6
-; GFX9-NEXT:    v_mul_hi_u32 v15, v4, v7
-; GFX9-NEXT:    v_mul_hi_u32 v14, v8, v7
-; GFX9-NEXT:    v_mul_lo_u32 v7, v8, v7
+; GFX9-NEXT:    v_mul_hi_u32 v14, v4, v7
+; GFX9-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GFX9-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GFX9-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v10, vcc, v11, v10
-; GFX9-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GFX9-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v15, vcc, v13, v15, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, v13, v14, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v7, v5, v7
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v10, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v11, vcc
-; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v14, v12, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v11, v9, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v12, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v6, vcc, v6, v7
 ; GFX9-NEXT:    v_addc_co_u32_e32 v7, vcc, v13, v8, vcc
-; GFX9-NEXT:    v_addc_co_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GFX9-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v6
-; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX9-NEXT:    v_addc_co_u32_e32 v5, vcc, v5, v7, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v0, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v0, v5

diff --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 263e10a51df5..578faa4a6ea0 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -372,9 +372,9 @@ define amdgpu_kernel void @vusubo64(i64 addrspace(1)* %out, i1 addrspace(1)* %ca
 ; GCN-ISEL-LABEL: body:
 ; GCN-ISEL-LABEL: bb.3
 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_ADD_CO_U32_e64
-; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
+; GCN-ISEL: S_ADD_CO_PSEUDO %{{[0-9]+}}, killed %{{[0-9]+}}, killed %[[CARRY]]
 ; GCN-ISEL: %[[CARRY:[0-9]+]]:sreg_64_xexec = V_SUB_CO_U32_e64
-; GCN-ISEL: S_SUB_CO_PSEUDO %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
+; GCN-ISEL: S_SUB_CO_PSEUDO killed %{{[0-9]+}}, %{{[0-9]+}}, %[[CARRY]]
 define amdgpu_kernel void @sudiv64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out

diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 639074501113..dc03778da7ac 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -5,36 +5,39 @@
 define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s2, s5, 31
-; GCN-NEXT:    s_add_u32 s4, s4, s2
-; GCN-NEXT:    s_mov_b32 s3, s2
-; GCN-NEXT:    s_addc_u32 s5, s5, s2
-; GCN-NEXT:    s_xor_b64 s[12:13], s[4:5], s[2:3]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s4, 0, s12
-; GCN-NEXT:    s_subb_u32 s5, 0, s13
-; GCN-NEXT:    s_ashr_i32 s14, s11, 31
+; GCN-NEXT:    s_ashr_i32 s8, s3, 31
+; GCN-NEXT:    s_add_u32 s2, s2, s8
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_addc_u32 s3, s3, s8
+; GCN-NEXT:    s_xor_b64 s[10:11], s[2:3], s[8:9]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GCN-NEXT:    s_sub_u32 s4, 0, s10
+; GCN-NEXT:    s_subb_u32 s5, 0, s11
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b32 s15, s14
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i32 s12, s3, 31
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT:    s_add_u32 s2, s2, s12
+; GCN-NEXT:    s_mov_b32 s13, s12
 ; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
 ; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
 ; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    s_addc_u32 s3, s3, s12
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
@@ -46,73 +49,69 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v7, v8, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
 ; GCN-NEXT:    v_mul_hi_u32 v5, v2, v5
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[12:13]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v6, v5, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v7, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
-; GCN-NEXT:    s_add_u32 s0, s10, s14
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
-; GCN-NEXT:    s_addc_u32 s1, s11, s14
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
-; GCN-NEXT:    s_xor_b64 s[10:11], s[0:1], s[14:15]
-; GCN-NEXT:    v_mul_lo_u32 v3, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v2
-; GCN-NEXT:    v_mul_hi_u32 v6, s11, v2
-; GCN-NEXT:    v_mul_lo_u32 v2, s11, v2
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v2
+; GCN-NEXT:    v_mul_hi_u32 v6, s3, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, s3, v2
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_mul_lo_u32 v5, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s12, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s12, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s13, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s13
+; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s11
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s12, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s10, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s12, v3
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s10, v3
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v4
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s11, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v5
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s11, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -120,18 +119,18 @@ define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v6, s11
+; GCN-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s11, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s12, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s10, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s13, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s11, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    s_xor_b64 s[0:1], s[14:15], s[2:3]
+; GCN-NEXT:    s_xor_b64 s[0:1], s[12:13], s[8:9]
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GCN-NEXT:    v_xor_b32_e32 v0, s0, v0
 ; GCN-NEXT:    v_xor_b32_e32 v1, s1, v1
@@ -291,32 +290,30 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v16, v14, vcc
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v11, vcc
-; GCN-NEXT:    v_add_i32_e64 v5, s[4:5], v5, v9
-; GCN-NEXT:    v_addc_u32_e64 v9, vcc, v6, v10, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v11, v7, v9
-; GCN-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v9, v7, v6
+; GCN-NEXT:    v_mul_hi_u32 v10, v7, v5
 ; GCN-NEXT:    v_mul_lo_u32 v8, v8, v5
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v5
+; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_mul_lo_u32 v11, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v12, v5, v7
+; GCN-NEXT:    v_mul_hi_u32 v13, v5, v8
+; GCN-NEXT:    v_mul_hi_u32 v10, v6, v7
+; GCN-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GCN-NEXT:    v_mul_hi_u32 v9, v6, v8
 ; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GCN-NEXT:    v_add_i32_e32 v8, vcc, v11, v8
-; GCN-NEXT:    v_mul_lo_u32 v13, v5, v8
-; GCN-NEXT:    v_mul_hi_u32 v16, v5, v7
-; GCN-NEXT:    v_mul_hi_u32 v17, v5, v8
-; GCN-NEXT:    v_mul_hi_u32 v12, v9, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v9, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
-; GCN-NEXT:    v_addc_u32_e32 v16, vcc, v15, v17, vcc
-; GCN-NEXT:    v_mul_lo_u32 v8, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v16, v12, vcc
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v11, v14, vcc
+; GCN-NEXT:    v_addc_u32_e32 v12, vcc, v15, v13, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v6, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v10, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v9, v14, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v15, v9, vcc
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v6, v8, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, 0, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
 ; GCN-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v7
@@ -1115,19 +1112,20 @@ define amdgpu_kernel void @s_test_sdiv24_48(i48 addrspace(1)* %out, i48 %x, i48
 define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-LABEL: s_test_sdiv_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s2, s7, 31
-; GCN-NEXT:    s_add_u32 s0, s6, s2
-; GCN-NEXT:    s_mov_b32 s3, s2
-; GCN-NEXT:    s_addc_u32 s1, s7, s2
-; GCN-NEXT:    s_xor_b64 s[8:9], s[0:1], s[2:3]
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GCN-NEXT:    s_sub_u32 s3, 0, s8
-; GCN-NEXT:    s_subb_u32 s6, 0, s9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_ashr_i32 s8, s3, 31
+; GCN-NEXT:    s_add_u32 s2, s2, s8
+; GCN-NEXT:    s_mov_b32 s9, s8
+; GCN-NEXT:    s_addc_u32 s3, s3, s8
+; GCN-NEXT:    s_xor_b64 s[2:3], s[2:3], s[8:9]
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -1137,10 +1135,10 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s6, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -1157,75 +1155,74 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s3, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s6, v0
-; GCN-NEXT:    s_mov_b32 s6, -1
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
-; GCN-NEXT:    v_mov_b32_e32 v5, s9
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v2, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v1, s9, v0
-; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 24, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v3
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v5
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    v_xor_b32_e32 v0, s2, v0
-; GCN-NEXT:    v_xor_b32_e32 v1, s2, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s2, v0
+; GCN-NEXT:    v_xor_b32_e32 v0, s8, v0
+; GCN-NEXT:    v_xor_b32_e32 v1, s8, v1
+; GCN-NEXT:    v_mov_b32_e32 v2, s8
+; GCN-NEXT:    v_subrev_i32_e32 v0, vcc, s8, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v1, v2, vcc
 ; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
@@ -1362,32 +1359,30 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v9, vcc
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v7, vcc, v4, v8, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v9, v5, v7
-; GCN-NEXT:    v_mul_hi_u32 v10, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v5, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v3
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_mul_lo_u32 v9, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GCN-NEXT:    v_mul_lo_u32 v11, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v14, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v15, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v7, v6
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GCN-NEXT:    v_addc_u32_e32 v14, vcc, v13, v15, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v7, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v14, v10, vcc
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v13, v11, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v13, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v4, v6, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, v4, 24
 ; GCN-NEXT:    v_mul_hi_u32 v3, v3, 24
 ; GCN-NEXT:    v_mul_hi_u32 v4, v4, 24
@@ -1540,6 +1535,7 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-NEXT:    v_mov_b32_e32 v13, 0
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
+; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
 ; GCN-NEXT:    v_mul_f32_e32 v4, 0x2f800000, v3
 ; GCN-NEXT:    v_trunc_f32_e32 v4, v4
@@ -1566,32 +1562,30 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v9, vcc
-; GCN-NEXT:    v_add_i32_e64 v3, s[4:5], v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v7, vcc, v4, v8, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v9, v5, v7
-; GCN-NEXT:    v_mul_hi_u32 v10, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v5, v4
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v3
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v3
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_mul_lo_u32 v9, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v10, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v4, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GCN-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GCN-NEXT:    v_mul_lo_u32 v11, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v14, v3, v5
-; GCN-NEXT:    v_mul_hi_u32 v15, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v7, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v7, v6
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GCN-NEXT:    v_addc_u32_e32 v14, vcc, v13, v15, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, v7, v6
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v14, v10, vcc
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v9, v12, vcc
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v13, v11, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v7, v12, vcc
 ; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v13, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v4, v6, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 17, v4
 ; GCN-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 17, v3
@@ -1599,7 +1593,6 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v13, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v3
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v3
 ; GCN-NEXT:    v_sub_i32_e32 v6, vcc, 0, v4

diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index 4cbff89f2dc4..e70184481b05 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -13,8 +13,8 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s2, 0, s12
-; GCN-NEXT:    s_subb_u32 s3, 0, s13
+; GCN-NEXT:    s_sub_u32 s0, 0, s12
+; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
@@ -26,10 +26,10 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s1, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -46,32 +46,30 @@ define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
 ; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
@@ -269,32 +267,30 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
-; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_mul_lo_u32 v10, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v11, v4, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_mul_lo_u32 v12, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v15, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v14, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v14, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GCN-NEXT:    v_xor_b32_e32 v0, v0, v6
@@ -889,23 +885,25 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    s_xor_b64 s[12:13], s[4:5], s[0:1]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s4, 0, s12
-; GCN-NEXT:    s_subb_u32 s5, 0, s13
+; GCN-NEXT:    s_sub_u32 s0, 0, s12
+; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_ashr_i32 s10, s11, 31
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_mov_b32 s11, s10
+; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v2, v2
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v3, s4, v2
-; GCN-NEXT:    v_mul_hi_u32 v4, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    s_mov_b32 s5, s9
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v5
@@ -922,35 +920,32 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v3
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s4, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v3, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v8, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v7, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v5, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v7, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
 ; GCN-NEXT:    s_add_u32 s0, s2, s10
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    s_addc_u32 s1, s3, s10
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
 ; GCN-NEXT:    s_xor_b64 s[14:15], s[0:1], s[10:11]
 ; GCN-NEXT:    v_mul_lo_u32 v3, s14, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, s14, v0
@@ -961,7 +956,6 @@ define amdgpu_kernel void @s_test_srem33_64(i64 addrspace(1)* %out, i64 %x, i64
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v7, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v5, s15, v0
 ; GCN-NEXT:    v_mul_hi_u32 v0, s15, v0
-; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v4, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
@@ -1295,23 +1289,25 @@ define amdgpu_kernel void @s_test_srem24_48(i48 addrspace(1)* %out, i48 %x, i48
 define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-LABEL: s_test_srem_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_ashr_i32 s0, s7, 31
-; GCN-NEXT:    s_add_u32 s2, s6, s0
-; GCN-NEXT:    s_mov_b32 s1, s0
-; GCN-NEXT:    s_addc_u32 s3, s7, s0
-; GCN-NEXT:    s_xor_b64 s[8:9], s[2:3], s[0:1]
+; GCN-NEXT:    s_ashr_i32 s4, s3, 31
+; GCN-NEXT:    s_add_u32 s2, s2, s4
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    s_addc_u32 s3, s3, s4
+; GCN-NEXT:    s_xor_b64 s[8:9], s[2:3], s[4:5]
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
 ; GCN-NEXT:    s_sub_u32 s2, 0, s8
 ; GCN-NEXT:    s_subb_u32 s3, 0, s9
-; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1338,32 +1334,30 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s2, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -1539,32 +1533,30 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
-; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v12, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v12, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v5, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, 24
 ; GCN-NEXT:    v_mul_hi_u32 v2, v2, 24
 ; GCN-NEXT:    v_mul_hi_u32 v3, v3, 24
@@ -1715,6 +1707,7 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-NEXT:    v_mov_b32_e32 v12, 0
 ; GCN-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
@@ -1741,32 +1734,30 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
-; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v12, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v12, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v5, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 17, v3
 ; GCN-NEXT:    v_lshlrev_b32_e32 v3, 15, v3
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
@@ -1775,7 +1766,6 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
 ; GCN-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GCN-NEXT:    s_mov_b32 s4, 0x8000
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v3
 ; GCN-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2

diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index fee867119bbd..d5291f792f26 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -5,16 +5,16 @@
 define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0xd
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0xd
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
-; GCN-NEXT:    s_sub_u32 s4, 0, s2
-; GCN-NEXT:    s_subb_u32 s5, 0, s3
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s9
+; GCN-NEXT:    s_sub_u32 s4, 0, s8
+; GCN-NEXT:    s_subb_u32 s5, 0, s9
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -44,65 +44,63 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s4, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s4, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s5, v0
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s4, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
-; GCN-NEXT:    v_mul_hi_u32 v7, s11, v3
-; GCN-NEXT:    v_mul_lo_u32 v3, s11, v3
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v6, s2, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, s3, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, s3, v3
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_mul_lo_u32 v6, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
-; GCN-NEXT:    s_mov_b32 s4, s8
+; GCN-NEXT:    v_mul_lo_u32 v6, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v5, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
-; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_mul_lo_u32 v2, s8, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s8, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s9, v0
+; GCN-NEXT:    v_mov_b32_e32 v5, s9
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s8, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s11, v2
-; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s10, v3
+; GCN-NEXT:    v_sub_i32_e32 v4, vcc, s3, v2
+; GCN-NEXT:    v_sub_i32_e32 v3, vcc, s2, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s8, v3
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s9, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s8, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s9, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v1, s[0:1]
@@ -110,13 +108,13 @@ define amdgpu_kernel void @s_test_udiv_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_addc_u32_e64 v8, s[0:1], 0, v1, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v8, v6, s[0:1]
-; GCN-NEXT:    v_mov_b32_e32 v6, s11
+; GCN-NEXT:    v_mov_b32_e32 v6, s3
 ; GCN-NEXT:    v_subb_u32_e32 v2, vcc, v6, v2, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v2
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s8, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v2
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v2
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
@@ -254,32 +252,30 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
-; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_mul_lo_u32 v10, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v11, v4, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_mul_lo_u32 v12, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v15, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v14, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v14, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -719,14 +715,14 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GCN-NEXT:    s_sub_u32 s2, 0, s0
-; GCN-NEXT:    s_subb_u32 s3, 0, s1
-; GCN-NEXT:    v_mul_hi_u32 v3, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v2
-; GCN-NEXT:    v_mul_lo_u32 v5, s3, v1
+; GCN-NEXT:    s_sub_u32 s0, 0, s0
+; GCN-NEXT:    s_subb_u32 s1, 0, s1
+; GCN-NEXT:    v_mul_hi_u32 v3, s0, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v2
+; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; GCN-NEXT:    v_mul_lo_u32 v6, v1, v3
 ; GCN-NEXT:    v_mul_hi_u32 v7, v1, v4
@@ -742,33 +738,31 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v8, vcc
 ; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
-; GCN-NEXT:    v_add_i32_e64 v1, s[0:1], v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v2, v4, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v5, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v6, s2, v1
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v1
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GCN-NEXT:    v_mul_lo_u32 v11, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v12, v1, v6
-; GCN-NEXT:    v_mul_hi_u32 v13, v1, v5
-; GCN-NEXT:    v_mul_hi_u32 v10, v3, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v3, v6
-; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GCN-NEXT:    v_addc_u32_e32 v12, vcc, v9, v13, vcc
-; GCN-NEXT:    v_mul_lo_u32 v3, v3, v5
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v11, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v12, v10, vcc
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v5, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v2, v5, s[0:1]
+; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GCN-NEXT:    v_mul_hi_u32 v4, s0, v1
+; GCN-NEXT:    v_mul_lo_u32 v5, s1, v1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_mul_lo_u32 v7, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v10, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v11, v1, v3
+; GCN-NEXT:    v_mul_hi_u32 v6, v2, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v5, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v9, v11, vcc
+; GCN-NEXT:    v_mul_lo_u32 v3, v2, v3
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v10, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v8, vcc
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GCN-NEXT:    v_mov_b32_e32 v3, s8
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v2, v4, vcc
 ; GCN-NEXT:    v_alignbit_b32 v3, s6, v3, 24
 ; GCN-NEXT:    v_mul_lo_u32 v4, v3, v2
 ; GCN-NEXT:    v_mul_hi_u32 v1, v3, v1
@@ -919,30 +913,28 @@ define amdgpu_kernel void @s_test_udiv24_i48(i48 addrspace(1)* %out, i48 %x, i48
 define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x) {
 ; GCN-LABEL: s_test_udiv_k_num_i64:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    s_mov_b32 s6, -1
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GCN-NEXT:    s_sub_u32 s2, 0, s6
-; GCN-NEXT:    s_subb_u32 s3, 0, s7
-; GCN-NEXT:    s_mov_b32 s8, s4
+; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GCN-NEXT:    s_sub_u32 s4, 0, s2
+; GCN-NEXT:    s_subb_u32 s5, 0, s3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_mov_b32 s9, s5
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GCN-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v0
 ; GCN-NEXT:    v_trunc_f32_e32 v3, v3
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s5, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s4, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -959,71 +951,71 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s4, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s4, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s5, v0
+; GCN-NEXT:    s_mov_b32 s5, s1
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s4, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
-; GCN-NEXT:    v_mov_b32_e32 v5, s7
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    s_mov_b32 s4, s0
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v2, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v1, s7, v0
-; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
-; GCN-NEXT:    v_mul_lo_u32 v3, s6, v0
+; GCN-NEXT:    v_mul_lo_u32 v3, s2, v0
 ; GCN-NEXT:    v_sub_i32_e32 v4, vcc, 0, v1
 ; GCN-NEXT:    v_sub_i32_e32 v3, vcc, 24, v3
 ; GCN-NEXT:    v_subb_u32_e64 v4, s[0:1], v4, v5, vcc
-; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s6, v3
+; GCN-NEXT:    v_subrev_i32_e64 v5, s[0:1], s2, v3
 ; GCN-NEXT:    v_subbrev_u32_e64 v4, s[0:1], 0, v4, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s7, v4
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s3, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v5
+; GCN-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
-; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s7, v4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[0:1], s3, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v6, v5, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v5, s[0:1], 2, v0
 ; GCN-NEXT:    v_addc_u32_e64 v6, s[0:1], 0, v2, s[0:1]
 ; GCN-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_addc_u32_e64 v2, s[0:1], 0, v2, s[0:1]
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s7, v1
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
-; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s6, v3
+; GCN-NEXT:    v_cmp_le_u32_e32 vcc, s2, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
-; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v1
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, s3, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v7, v5, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_udiv_k_num_i64:
@@ -1148,32 +1140,30 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
-; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v12, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v12, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v5, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2
@@ -1390,7 +1380,7 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_movk_i32 s2, 0xffe8
+; GCN-NEXT:    s_movk_i32 s4, 0xffe8
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -1399,12 +1389,13 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0x9
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
 ; GCN-NEXT:    s_mov_b32 s7, 0xf000
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -1421,43 +1412,40 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, s2
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s4, s8
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GCN-NEXT:    s_mov_b32 s5, s9
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s10, v0
-; GCN-NEXT:    v_mul_hi_u32 v4, s10, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, s11, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s11, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s11, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s11, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -1471,8 +1459,8 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; GCN-NEXT:    v_mov_b32_e32 v5, s11
-; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s10, v8
+; GCN-NEXT:    v_mov_b32_e32 v5, s3
+; GCN-NEXT:    v_sub_i32_e32 v8, vcc, s2, v8
 ; GCN-NEXT:    v_subb_u32_e32 v4, vcc, v5, v4, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v5, vcc, 24, v8
 ; GCN-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
@@ -1578,7 +1566,7 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GCN-NEXT:    v_madak_f32 v2, 0, v2, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v2, v2
-; GCN-NEXT:    s_movk_i32 s6, 0xffe8
+; GCN-NEXT:    s_movk_i32 s4, 0xffe8
 ; GCN-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-NEXT:    v_mov_b32_e32 v9, 0
 ; GCN-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1587,9 +1575,9 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GCN-NEXT:    v_mul_hi_u32 v4, v2, s6
-; GCN-NEXT:    v_mul_lo_u32 v5, v3, s6
-; GCN-NEXT:    v_mul_lo_u32 v6, v2, s6
+; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
+; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
 ; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_mul_lo_u32 v7, v2, v4
@@ -1606,31 +1594,29 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[4:5]
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, s6
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, s6
-; GCN-NEXT:    v_mul_lo_u32 v8, v2, s6
-; GCN-NEXT:    v_subrev_i32_e32 v6, vcc, v2, v6
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v8
-; GCN-NEXT:    v_mul_hi_u32 v12, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v13, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v10, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v12, v4, v8
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v8
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v8, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v13, v9, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_hi_u32 v4, v2, s4
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, s4
+; GCN-NEXT:    v_mul_lo_u32 v6, v2, s4
+; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v2, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v2, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v11, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v10, v8, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v10, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, v0, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v2
 ; GCN-NEXT:    v_mul_hi_u32 v6, v0, v3

diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index ffed673abfc2..3d501dc4074e 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -13,8 +13,8 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s12
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s13
-; GCN-NEXT:    s_sub_u32 s2, 0, s12
-; GCN-NEXT:    s_subb_u32 s3, 0, s13
+; GCN-NEXT:    s_sub_u32 s0, 0, s12
+; GCN-NEXT:    s_subb_u32 s1, 0, s13
 ; GCN-NEXT:    s_mov_b32 s4, s8
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
@@ -26,10 +26,10 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s1, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -46,32 +46,30 @@ define amdgpu_kernel void @s_test_urem_i64(i64 addrspace(1)* %out, i64 %x, i64 %
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v9, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v6, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v4, s10, v3
 ; GCN-NEXT:    v_mul_hi_u32 v5, s10, v0
 ; GCN-NEXT:    v_mul_hi_u32 v6, s10, v3
@@ -264,32 +262,30 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-NEXT:    v_addc_u32_e32 v10, vcc, v15, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v14, v10, vcc
-; GCN-NEXT:    v_add_i32_e64 v4, s[4:5], v4, v8
-; GCN-NEXT:    v_addc_u32_e64 v8, vcc, v5, v9, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v10, v6, v8
-; GCN-NEXT:    v_mul_hi_u32 v11, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GCN-NEXT:    v_mul_lo_u32 v8, v6, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
 ; GCN-NEXT:    v_mul_lo_u32 v7, v7, v4
 ; GCN-NEXT:    v_mul_lo_u32 v6, v6, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GCN-NEXT:    v_mul_lo_u32 v10, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v11, v4, v6
+; GCN-NEXT:    v_mul_hi_u32 v12, v4, v7
+; GCN-NEXT:    v_mul_hi_u32 v9, v5, v6
+; GCN-NEXT:    v_mul_lo_u32 v6, v5, v6
+; GCN-NEXT:    v_mul_hi_u32 v8, v5, v7
 ; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_mul_lo_u32 v12, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v15, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v16, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v11, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v6, v8, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; GCN-NEXT:    v_addc_u32_e32 v15, vcc, v14, v16, vcc
-; GCN-NEXT:    v_mul_lo_u32 v7, v8, v7
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v12, v6
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v15, v11, vcc
-; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v10, v13, vcc
+; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v14, v12, vcc
+; GCN-NEXT:    v_mul_lo_u32 v7, v5, v7
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v11, v9, vcc
+; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v14, v8, vcc
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; GCN-NEXT:    v_addc_u32_e64 v5, vcc, v5, v7, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v5, v7, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v6, v0, v5
 ; GCN-NEXT:    v_mul_hi_u32 v7, v0, v4
 ; GCN-NEXT:    v_mul_hi_u32 v8, v0, v5
@@ -747,8 +743,8 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v0, s6
 ; GCN-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; GCN-NEXT:    s_sub_u32 s2, 0, s6
-; GCN-NEXT:    s_subb_u32 s3, 0, s7
+; GCN-NEXT:    s_sub_u32 s0, 0, s6
+; GCN-NEXT:    s_subb_u32 s1, 0, s7
 ; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
@@ -760,10 +756,10 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GCN-NEXT:    v_mul_lo_u32 v4, s2, v3
-; GCN-NEXT:    v_mul_hi_u32 v5, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v7, s3, v0
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v7, s1, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s0, v0
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; GCN-NEXT:    v_mul_hi_u32 v5, v0, v6
@@ -780,32 +776,30 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v10, v1, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v2, v6, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v4
-; GCN-NEXT:    v_addc_u32_e64 v4, vcc, v3, v5, s[0:1]
-; GCN-NEXT:    v_mul_lo_u32 v6, s2, v4
-; GCN-NEXT:    v_mul_hi_u32 v7, s2, v0
-; GCN-NEXT:    v_mul_lo_u32 v8, s3, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GCN-NEXT:    v_mul_lo_u32 v7, s2, v0
-; GCN-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GCN-NEXT:    v_mul_lo_u32 v10, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v11, v0, v7
-; GCN-NEXT:    v_mul_hi_u32 v12, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v7
-; GCN-NEXT:    v_mul_lo_u32 v7, v4, v7
-; GCN-NEXT:    v_mul_hi_u32 v8, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GCN-NEXT:    v_addc_u32_e32 v11, vcc, v2, v12, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, v4, v6
-; GCN-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v11, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, s0, v3
+; GCN-NEXT:    v_mul_hi_u32 v5, s0, v0
+; GCN-NEXT:    v_mul_lo_u32 v6, s1, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; GCN-NEXT:    v_mul_lo_u32 v5, s0, v0
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
+; GCN-NEXT:    v_mul_lo_u32 v8, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v9, v0, v5
+; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v5
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v2, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v6, v1, vcc
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v3, v1, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v3, v1, vcc
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, 24
 ; GCN-NEXT:    v_mul_hi_u32 v0, v0, 24
 ; GCN-NEXT:    v_mul_hi_u32 v1, v1, 24
@@ -941,7 +935,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0x4f800000
 ; GCN-NEXT:    v_madak_f32 v0, 0, v0, 0x41c00000
 ; GCN-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-NEXT:    s_movk_i32 s2, 0xffe8
+; GCN-NEXT:    s_movk_i32 s4, 0xffe8
 ; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_mov_b32_e32 v7, 0
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
@@ -950,12 +944,13 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v1
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GCN-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GCN-NEXT:    s_mov_b32 s11, 0xf000
-; GCN-NEXT:    v_mul_hi_u32 v2, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GCN-NEXT:    v_mul_lo_u32 v4, v0, s2
-; GCN-NEXT:    s_mov_b32 s10, -1
+; GCN-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
+; GCN-NEXT:    s_mov_b32 s7, 0xf000
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_mov_b32 s5, s1
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v0, v2
@@ -972,43 +967,40 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e64 v0, s[0:1], v0, v2
-; GCN-NEXT:    v_addc_u32_e64 v2, vcc, v1, v3, s[0:1]
-; GCN-NEXT:    v_mul_hi_u32 v4, v0, s2
-; GCN-NEXT:    v_mul_lo_u32 v5, v2, s2
-; GCN-NEXT:    v_mul_lo_u32 v6, v0, s2
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s8, s4
-; GCN-NEXT:    v_subrev_i32_e32 v4, vcc, v0, v4
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GCN-NEXT:    v_mul_lo_u32 v5, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v9, v0, v6
-; GCN-NEXT:    v_mul_hi_u32 v10, v0, v4
-; GCN-NEXT:    v_mul_hi_u32 v11, v2, v4
-; GCN-NEXT:    s_mov_b32 s9, s5
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v8, v10, vcc
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v6
-; GCN-NEXT:    v_mul_hi_u32 v6, v2, v6
-; GCN-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v9, v6, vcc
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v11, v7, vcc
-; GCN-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v8, v4, vcc
-; GCN-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GCN-NEXT:    v_addc_u32_e64 v1, vcc, v1, v4, s[0:1]
 ; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN-NEXT:    v_mul_lo_u32 v2, s6, v1
-; GCN-NEXT:    v_mul_hi_u32 v3, s6, v0
-; GCN-NEXT:    v_mul_hi_u32 v4, s6, v1
-; GCN-NEXT:    v_mul_hi_u32 v5, s7, v1
-; GCN-NEXT:    v_mul_lo_u32 v1, s7, v1
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_hi_u32 v2, v0, s4
+; GCN-NEXT:    v_mul_lo_u32 v3, v1, s4
+; GCN-NEXT:    v_mul_lo_u32 v4, v0, s4
+; GCN-NEXT:    s_mov_b32 s4, s0
+; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, v0, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; GCN-NEXT:    v_mul_lo_u32 v3, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v0, v2
+; GCN-NEXT:    v_mul_hi_u32 v9, v1, v2
+; GCN-NEXT:    v_mul_lo_u32 v2, v1, v2
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v8, v6, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v1, v4
+; GCN-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GCN-NEXT:    s_mov_b32 s6, -1
+; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v5, v4, vcc
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
+; GCN-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GCN-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; GCN-NEXT:    v_mul_lo_u32 v2, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v3, s2, v0
+; GCN-NEXT:    v_mul_hi_u32 v4, s2, v1
+; GCN-NEXT:    v_mul_hi_u32 v5, s3, v1
+; GCN-NEXT:    v_mul_lo_u32 v1, s3, v1
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v8, v4, vcc
-; GCN-NEXT:    v_mul_lo_u32 v4, s7, v0
-; GCN-NEXT:    v_mul_hi_u32 v0, s7, v0
+; GCN-NEXT:    v_mul_lo_u32 v4, s3, v0
+; GCN-NEXT:    v_mul_hi_u32 v0, s3, v0
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GCN-NEXT:    v_addc_u32_e32 v0, vcc, v3, v0, vcc
 ; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v5, v7, vcc
@@ -1018,8 +1010,8 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_mul_hi_u32 v2, v0, 24
 ; GCN-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GCN-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
-; GCN-NEXT:    v_mov_b32_e32 v2, s7
-; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s6, v0
+; GCN-NEXT:    v_mov_b32_e32 v2, s3
+; GCN-NEXT:    v_sub_i32_e32 v0, vcc, s2, v0
 ; GCN-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v0
 ; GCN-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v1, vcc
@@ -1039,7 +1031,7 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(i64 addrspace(1)* %out, i64 %x)
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
-; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
+; GCN-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
 ;
 ; GCN-IR-LABEL: s_test_urem_k_den_i64:
@@ -1173,32 +1165,30 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-NEXT:    v_addc_u32_e32 v8, vcc, v13, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GCN-NEXT:    v_addc_u32_e32 v7, vcc, v12, v8, vcc
-; GCN-NEXT:    v_add_i32_e64 v2, s[4:5], v2, v6
-; GCN-NEXT:    v_addc_u32_e64 v6, vcc, v3, v7, s[4:5]
-; GCN-NEXT:    v_mul_lo_u32 v8, v4, v6
-; GCN-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; GCN-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
+; GCN-NEXT:    v_mul_lo_u32 v6, v4, v3
+; GCN-NEXT:    v_mul_hi_u32 v7, v4, v2
 ; GCN-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; GCN-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GCN-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GCN-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GCN-NEXT:    v_mul_lo_u32 v8, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v9, v2, v4
+; GCN-NEXT:    v_mul_hi_u32 v10, v2, v5
+; GCN-NEXT:    v_mul_hi_u32 v7, v3, v4
+; GCN-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GCN-NEXT:    v_mul_hi_u32 v6, v3, v5
 ; GCN-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GCN-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GCN-NEXT:    v_mul_lo_u32 v10, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v13, v2, v4
-; GCN-NEXT:    v_mul_hi_u32 v14, v2, v5
-; GCN-NEXT:    v_mul_hi_u32 v9, v6, v4
-; GCN-NEXT:    v_mul_lo_u32 v4, v6, v4
-; GCN-NEXT:    v_mul_hi_u32 v8, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
-; GCN-NEXT:    v_addc_u32_e32 v13, vcc, v12, v14, vcc
-; GCN-NEXT:    v_mul_lo_u32 v5, v6, v5
-; GCN-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v13, v9, vcc
-; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v8, v11, vcc
+; GCN-NEXT:    v_addc_u32_e32 v9, vcc, v12, v10, vcc
+; GCN-NEXT:    v_mul_lo_u32 v5, v3, v5
+; GCN-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
+; GCN-NEXT:    v_addc_u32_e32 v4, vcc, v9, v7, vcc
+; GCN-NEXT:    v_addc_u32_e32 v6, vcc, v6, v11, vcc
 ; GCN-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GCN-NEXT:    v_addc_u32_e32 v5, vcc, v12, v6, vcc
-; GCN-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
-; GCN-NEXT:    v_addc_u32_e64 v3, vcc, v3, v5, s[4:5]
 ; GCN-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_u32_e32 v2, vcc, 0, v3, vcc
+; GCN-NEXT:    v_addc_u32_e32 v2, vcc, v3, v5, vcc
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
 ; GCN-NEXT:    v_mul_lo_u32 v3, v1, v2
 ; GCN-NEXT:    v_mul_hi_u32 v4, v0, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 1c41f142b372..712ad6d990d3 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -330,26 +330,24 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_udiv64:
-; GFX1032: v_add_co_u32 v{{[0-9]+}}, [[SDST:s[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
-; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
-; GFX1032: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
 ; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
 ; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
+; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
 ; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, 0, v{{[0-9]+}}, vcc_lo
-; GFX1032: v_sub_co_u32 v{{[0-9]+}}, vcc_lo, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX1032: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
-; GFX1032: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc_lo
-; GFX1064: v_add_co_u32 v{{[0-9]+}}, [[SDST:s\[[0-9:]+\]]], v{{[0-9]+}}, v{{[0-9]+}}
-; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-; GFX1064: v_add_co_ci_u32_e64 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, [[SDST]]
+; GFX1032: v_add_co_u32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1032: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc_lo, v{{[0-9]+}}, v{{[0-9]+}}, vcc_lo
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
+; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
 ; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc{{$}}
-; GFX1064: v_sub_co_u32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-; GFX1064: v_subrev_co_ci_u32_e64 v{{[0-9]+}}, s[{{[0-9:]+}}], {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
-; GFX1064: v_sub_co_ci_u32_e32 v{{[0-9]+}}, vcc, {{[vs][0-9]+}}, v{{[0-9]+}}, vcc
+; GFX1064: v_add_co_u32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX1064: v_add_co_ci_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc{{$}}
 define amdgpu_kernel void @test_udiv64(i64 addrspace(1)* %arg) #0 {
 bb:
   %tmp = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 1

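The updated GFX1032/GFX1064 patterns above pair each v_add_co_u32, which
writes its carry-out to vcc, with a following v_add_co_ci_u32, which
consumes that carry from vcc. That is just a carry-propagating 64-bit add
built from 32-bit halves. A rough C sketch of the pattern (the helper name
and test values are illustrative only, not taken from the patch):

    #include <stdint.h>
    #include <stdio.h>

    /* 64-bit add from 32-bit halves: the low add produces a carry-out
       (v_add_co_u32) and the high add consumes it (v_add_co_ci_u32). */
    static uint64_t add64_via_halves(uint64_t a, uint64_t b) {
      uint32_t a_lo = (uint32_t)a, a_hi = (uint32_t)(a >> 32);
      uint32_t b_lo = (uint32_t)b, b_hi = (uint32_t)(b >> 32);
      uint32_t lo = a_lo + b_lo;         /* low halves; may wrap */
      uint32_t carry = lo < a_lo;        /* carry-out of the low add */
      uint32_t hi = a_hi + b_hi + carry; /* high halves plus carry-in */
      return ((uint64_t)hi << 32) | lo;
    }

    int main(void) {
      /* 0x1ffffffff + 1 carries into the high half: expect 0x200000000. */
      uint64_t r = add64_via_halves(0x1ffffffffULL, 1);
      printf("0x%llx\n", (unsigned long long)r);
      return 0;
    }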