[llvm] [AMDGPU] Enable sinking of free vector ops that will be folded into their uses (PR #162580)

Gheorghe-Teodor Bercea via llvm-commits llvm-commits at lists.llvm.org
Mon Oct 27 12:35:26 PDT 2025


https://github.com/doru1004 updated https://github.com/llvm/llvm-project/pull/162580

>From 0cf29f3b9edaf34e548c4f0ca984cec6d38e2ea5 Mon Sep 17 00:00:00 2001
From: Gheorghe-Teodor Bercea <dobercea at amd.com>
Date: Wed, 8 Oct 2025 17:45:43 -0500
Subject: [PATCH] Allow sinking of free vector ops

---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |  81 ++
 llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll   | 120 +--
 llvm/test/CodeGen/AMDGPU/frem.ll              | 930 +++++++++---------
 llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll  |  21 +-
 llvm/test/CodeGen/AMDGPU/srem.ll              | 808 +++++++--------
 .../AMDGPU/undef-handling-crash-in-ra.ll      |   2 -
 6 files changed, 996 insertions(+), 966 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 03d16fdd54c42..27435e9c141d1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1301,6 +1301,87 @@ bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
 
     if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
       Ops.push_back(&Op);
+
+    // Zero cost vector instructions (e.g. extractelement 0 of i32 vectors)
+    // will be optimized away, and sinking them can help SDAG combines.
+    const DataLayout &DL = I->getModule()->getDataLayout();
+
+    uint64_t VecIndex;
+    Value *Vec;
+    if (match(Op.get(), m_ExtractElt(m_Value(Vec), m_ConstantInt(VecIndex)))) {
+      Instruction *OpInst = cast<Instruction>(Op.get());
+      Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
+    // If a zero cost extractelement instruction is the only use of the vector,
+      // then it may be combined with the def.
+      if (VecOpInst && VecOpInst->hasOneUse())
+        continue;
+
+      if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
+                             TTI::TCK_RecipThroughput, VecIndex,
+                             OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (match(Op.get(),
+              m_InsertElt(m_Value(Vec), m_Value(), m_ConstantInt(VecIndex)))) {
+      Instruction *OpInst = cast<Instruction>(Op.get());
+      if (getVectorInstrCost(OpInst->getOpcode(), Vec->getType(),
+                             TTI::TCK_RecipThroughput, VecIndex,
+                             OpInst->getOperand(0), OpInst->getOperand(1)) == 0)
+        Ops.push_back(&Op);
+
+      continue;
+    }
+
+    if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
+      if (Shuffle->isIdentity()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      unsigned EltSize = DL.getTypeSizeInBits(
+          cast<VectorType>(cast<VectorType>(Shuffle->getType()))
+              ->getElementType());
+
+      // For i32 (or greater) shufflevectors, these will be lowered into a
+      // series of insert / extract elements, which will be coalesced away.
+      if (EltSize >= 32) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (EltSize < 16 || !ST->has16BitInsts())
+        continue;
+
+      int NumSubElts, SubIndex;
+      if (Shuffle->changesLength()) {
+        if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
+          Ops.push_back(&Op);
+          continue;
+        }
+
+        if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
+             Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
+            !(SubIndex & 0x1)) {
+          Ops.push_back(&Op);
+          continue;
+        }
+      }
+
+      if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
+          Shuffle->isSingleSource()) {
+        Ops.push_back(&Op);
+        continue;
+      }
+
+      if (Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex) &&
+          !(SubIndex & 0x1)) {
+        Ops.push_back(&Op);
+        continue;
+      }
+    }
   }
 
   return !Ops.empty();
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index a43bfb5d45679..2fc4427745632 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -2146,11 +2146,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB11_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else16
 ; CI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s4
-; CI-NEXT:    v_mov_b32_e32 v0, s2
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s6
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s4
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s6
+; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s6, 0
 ; CI-NEXT:  .LBB11_2: ; %Flow53
 ; CI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2221,11 +2221,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB11_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else
 ; CI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v2, s5
-; CI-NEXT:    v_mov_b32_e32 v1, s3
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s6
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, s5
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s6
+; CI-NEXT:    v_mov_b32_e32 v2, s3
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s6, 0
 ; CI-NEXT:  .LBB11_10: ; %Flow49
 ; CI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2319,11 +2319,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB11_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else16
 ; VI-NEXT:    s_and_b32 s6, s2, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s4
-; VI-NEXT:    v_mov_b32_e32 v0, s2
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s6
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s4
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s2|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s6
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-NEXT:    s_mov_b32 s6, 0
 ; VI-NEXT:  .LBB11_2: ; %Flow53
 ; VI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2394,11 +2394,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB11_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else
 ; VI-NEXT:    s_and_b32 s6, s3, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s5
-; VI-NEXT:    v_mov_b32_e32 v1, s3
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, s5
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s3|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s6
+; VI-NEXT:    v_mov_b32_e32 v2, s3
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    s_mov_b32 s6, 0
 ; VI-NEXT:  .LBB11_10: ; %Flow49
 ; VI-NEXT:    s_xor_b32 s6, s6, 1
@@ -2500,11 +2500,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else78
 ; CI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v1, s8
-; CI-NEXT:    v_mov_b32_e32 v0, s4
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; CI-NEXT:    v_mov_b32_e32 v1, s2
-; CI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; CI-NEXT:    v_mov_b32_e32 v0, s8
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; CI-NEXT:    v_mov_b32_e32 v0, s2
+; CI-NEXT:    v_mov_b32_e32 v1, s4
+; CI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_2: ; %Flow127
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2575,11 +2575,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else47
 ; CI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v2, s9
-; CI-NEXT:    v_mov_b32_e32 v1, s5
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; CI-NEXT:    v_mov_b32_e32 v2, s2
-; CI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; CI-NEXT:    v_mov_b32_e32 v1, s9
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; CI-NEXT:    v_mov_b32_e32 v1, s2
+; CI-NEXT:    v_mov_b32_e32 v2, s5
+; CI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_10: ; %Flow123
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2650,11 +2650,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else16
 ; CI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v3, s10
-; CI-NEXT:    v_mov_b32_e32 v2, s6
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; CI-NEXT:    v_mov_b32_e32 v3, s2
-; CI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; CI-NEXT:    v_mov_b32_e32 v2, s10
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; CI-NEXT:    v_mov_b32_e32 v2, s2
+; CI-NEXT:    v_mov_b32_e32 v3, s6
+; CI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_18: ; %Flow119
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2725,11 +2725,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB12_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else
 ; CI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; CI-NEXT:    v_mov_b32_e32 v4, s11
-; CI-NEXT:    v_mov_b32_e32 v3, s7
-; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; CI-NEXT:    v_mov_b32_e32 v4, s2
-; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; CI-NEXT:    v_mov_b32_e32 v3, s11
+; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; CI-NEXT:    v_mov_b32_e32 v3, s2
+; CI-NEXT:    v_mov_b32_e32 v4, s7
+; CI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; CI-NEXT:    s_mov_b32 s2, 0
 ; CI-NEXT:  .LBB12_26: ; %Flow115
 ; CI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2831,11 +2831,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else78
 ; VI-NEXT:    s_and_b32 s2, s4, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v1, s8
-; VI-NEXT:    v_mov_b32_e32 v0, s4
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v1|
-; VI-NEXT:    v_mov_b32_e32 v1, s2
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; VI-NEXT:    v_mov_b32_e32 v0, s8
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s4|, |v0|
+; VI-NEXT:    v_mov_b32_e32 v0, s2
+; VI-NEXT:    v_mov_b32_e32 v1, s4
+; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_2: ; %Flow127
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2906,11 +2906,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else47
 ; VI-NEXT:    s_and_b32 s2, s5, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v2, s9
-; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v2|
-; VI-NEXT:    v_mov_b32_e32 v2, s2
-; VI-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-NEXT:    v_mov_b32_e32 v1, s9
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s5|, |v1|
+; VI-NEXT:    v_mov_b32_e32 v1, s2
+; VI-NEXT:    v_mov_b32_e32 v2, s5
+; VI-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_10: ; %Flow123
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -2981,11 +2981,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else16
 ; VI-NEXT:    s_and_b32 s2, s6, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v3, s10
-; VI-NEXT:    v_mov_b32_e32 v2, s6
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v3|
-; VI-NEXT:    v_mov_b32_e32 v3, s2
-; VI-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; VI-NEXT:    v_mov_b32_e32 v2, s10
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s6|, |v2|
+; VI-NEXT:    v_mov_b32_e32 v2, s2
+; VI-NEXT:    v_mov_b32_e32 v3, s6
+; VI-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_18: ; %Flow119
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
@@ -3056,11 +3056,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_cbranch_vccz .LBB12_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else
 ; VI-NEXT:    s_and_b32 s2, s7, 0x80000000
-; VI-NEXT:    v_mov_b32_e32 v4, s11
-; VI-NEXT:    v_mov_b32_e32 v3, s7
-; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v4|
-; VI-NEXT:    v_mov_b32_e32 v4, s2
-; VI-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; VI-NEXT:    v_mov_b32_e32 v3, s11
+; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |s7|, |v3|
+; VI-NEXT:    v_mov_b32_e32 v3, s2
+; VI-NEXT:    v_mov_b32_e32 v4, s7
+; VI-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
 ; VI-NEXT:    s_mov_b32 s2, 0
 ; VI-NEXT:  .LBB12_26: ; %Flow115
 ; VI-NEXT:    s_xor_b32 s2, s2, 1
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index d8cbdb11a911f..7230cb6208a2b 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -5783,11 +5783,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v5, v2
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v2, v5
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6219,12 +6219,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s4
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6686,12 +6686,12 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s4
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB9_8
@@ -8956,11 +8956,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
+; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v7, v4
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v4, v7
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_8
@@ -9791,12 +9791,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s5
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_8
@@ -10694,12 +10694,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
+; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s5
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
 ; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_8
@@ -12687,18 +12687,18 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150:       ; %bb.0:
 ; GFX1150-NEXT:    s_clause 0x1
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1150-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
+; GFX1150-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
 ; GFX1150-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1150-NEXT:    global_load_b64 v[1:2], v2, s[6:7] offset:32
 ; GFX1150-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX1150-NEXT:    global_load_b64 v[0:1], v2, s[8:9] offset:32
 ; GFX1150-NEXT:    s_and_b32 s3, s6, 0x7fffffff
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1150-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v1
 ; GFX1150-NEXT:    s_and_b32 s8, s4, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s8
@@ -12906,232 +12906,221 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-LABEL: frem_v2f32:
 ; GFX1200:       ; %bb.0:
 ; GFX1200-NEXT:    s_clause 0x1
-; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1200-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
-; GFX1200-NEXT:    v_mov_b32_e32 v2, 0
+; GFX1200-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1200-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
-; GFX1200-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
-; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s5, v1
-; GFX1200-NEXT:    global_load_b64 v[1:2], v2, s[6:7] offset:32
-; GFX1200-NEXT:    v_readfirstlane_b32 s6, v0
-; GFX1200-NEXT:    s_and_b32 s3, s6, 0x7fffffff
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    global_load_b64 v[2:3], v0, s[10:11]
+; GFX1200-NEXT:    global_load_b64 v[0:1], v0, s[0:1] offset:32
+; GFX1200-NEXT:    s_wait_loadcnt 0x1
+; GFX1200-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v2
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
-; GFX1200-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1200-NEXT:    s_and_b32 s8, s4, 0x7fffffff
-; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s3, s8
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_2
-; GFX1200-NEXT:  ; %bb.1: ; %frem.else16
-; GFX1200-NEXT:    s_cmp_eq_f32 s3, s8
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s6
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
+; GFX1200-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v5
+; GFX1200-NEXT:    s_cbranch_vccz .LBB11_2
+; GFX1200-NEXT:  ; %bb.1: ; %frem.else16
+; GFX1200-NEXT:    v_bfi_b32 v6, 0x7fffffff, 0, v2
+; GFX1200-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT:    v_cndmask_b32_e32 v5, v2, v6, vcc_lo
 ; GFX1200-NEXT:    s_cbranch_execz .LBB11_3
 ; GFX1200-NEXT:    s_branch .LBB11_8
 ; GFX1200-NEXT:  .LBB11_2:
-; GFX1200-NEXT:    ; implicit-def: $vgpr0
+; GFX1200-NEXT:    ; implicit-def: $vgpr5
 ; GFX1200-NEXT:  .LBB11_3: ; %frem.compute15
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |s6|
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v6, |v0|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v5, |v2|
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v8, v2
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_ldexp_f32 v1, v1, 1
-; GFX1200-NEXT:    v_ldexp_f32 v2, v0, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v0, s4
+; GFX1200-NEXT:    v_ldexp_f32 v6, v6, 1
+; GFX1200-NEXT:    v_ldexp_f32 v7, v5, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v5, v0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX1200-NEXT:    v_div_scale_f32 v5, null, v1, v1, 1.0
+; GFX1200-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX1200-NEXT:    v_div_scale_f32 v10, null, v6, v6, 1.0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX1200-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX1200-NEXT:    v_rcp_f32_e32 v6, v5
+; GFX1200-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX1200-NEXT:    v_add_nc_u32_e32 v5, -1, v5
+; GFX1200-NEXT:    v_rcp_f32_e32 v11, v10
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v4, v0
-; GFX1200-NEXT:    v_add_nc_u32_e32 v4, v4, v3
-; GFX1200-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
+; GFX1200-NEXT:    v_not_b32_e32 v9, v5
+; GFX1200-NEXT:    v_add_nc_u32_e32 v9, v9, v8
+; GFX1200-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
 ; GFX1200-NEXT:    s_denorm_mode 15
 ; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; GFX1200-NEXT:    v_fmac_f32_e32 v6, v7, v6
+; GFX1200-NEXT:    v_fma_f32 v12, -v10, v11, 1.0
+; GFX1200-NEXT:    v_fmac_f32_e32 v11, v12, v11
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX1200-NEXT:    v_fma_f32 v8, -v5, v7, v3
+; GFX1200-NEXT:    v_mul_f32_e32 v12, v8, v11
+; GFX1200-NEXT:    v_fma_f32 v13, -v10, v12, v8
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
-; GFX1200-NEXT:    v_fma_f32 v3, -v5, v7, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v12, v13, v11
+; GFX1200-NEXT:    v_fma_f32 v8, -v10, v12, v8
 ; GFX1200-NEXT:    s_denorm_mode 12
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
-; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
+; GFX1200-NEXT:    v_div_fmas_f32 v8, v8, v11, v12
+; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v9
+; GFX1200-NEXT:    v_div_fixup_f32 v8, v8, v6, 1.0
 ; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_7
 ; GFX1200-NEXT:  ; %bb.4: ; %frem.loop_body23.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s7, s7, s8
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s7, s7, 12
+; GFX1200-NEXT:    s_sub_co_i32 s0, s0, s1
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT:    s_add_co_i32 s0, s0, 12
 ; GFX1200-NEXT:  .LBB11_5: ; %frem.loop_body23
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s7, s7, -12
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_gt_i32 s7, 12
-; GFX1200-NEXT:    v_mul_f32_e32 v2, v5, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v10, v7
+; GFX1200-NEXT:    s_add_co_i32 s0, s0, -12
+; GFX1200-NEXT:    s_cmp_gt_i32 s0, 12
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_rndne_f32_e32 v2, v2
-; GFX1200-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
+; GFX1200-NEXT:    v_mul_f32_e32 v7, v10, v8
+; GFX1200-NEXT:    v_rndne_f32_e32 v7, v7
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v2, v2, v1, v5
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
-; GFX1200-NEXT:    v_add_f32_e32 v4, v2, v1
+; GFX1200-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1200-NEXT:    v_fma_f32 v7, v7, v6, v10
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX1200-NEXT:    v_add_f32_e32 v9, v7, v6
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 12
+; GFX1200-NEXT:    v_cndmask_b32_e32 v7, v7, v9, vcc_lo
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_ldexp_f32 v7, v7, 12
 ; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_5
 ; GFX1200-NEXT:  ; %bb.6: ; %Flow51
-; GFX1200-NEXT:    v_mov_b32_e32 v4, s7
-; GFX1200-NEXT:    v_mov_b32_e32 v2, v5
+; GFX1200-NEXT:    v_mov_b32_e32 v9, s0
+; GFX1200-NEXT:    v_mov_b32_e32 v7, v10
 ; GFX1200-NEXT:  .LBB11_7: ; %frem.loop_exit24
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_add_nc_u32_e32 v4, -11, v4
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, v4
+; GFX1200-NEXT:    v_add_nc_u32_e32 v9, -11, v9
+; GFX1200-NEXT:    v_ldexp_f32 v7, v7, v9
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX1200-NEXT:    v_rndne_f32_e32 v3, v3
+; GFX1200-NEXT:    v_mul_f32_e32 v8, v7, v8
+; GFX1200-NEXT:    v_rndne_f32_e32 v8, v8
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1200-NEXT:    v_fmac_f32_e32 v2, v3, v1
+; GFX1200-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
+; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
-; GFX1200-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v7
+; GFX1200-NEXT:    v_add_f32_e32 v6, v7, v6
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s6
+; GFX1200-NEXT:    v_ldexp_f32 v5, v6, v5
+; GFX1200-NEXT:    v_bfi_b32 v5, 0x7fffffff, v5, v2
 ; GFX1200-NEXT:  .LBB11_8:
-; GFX1200-NEXT:    s_and_b32 s6, s5, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s8, s2, 0x7fffffff
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_ngt_f32 s6, s8
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_10
-; GFX1200-NEXT:  ; %bb.9: ; %frem.else
-; GFX1200-NEXT:    s_cmp_eq_f32 s6, s8
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s5
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v3
+; GFX1200-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
+; GFX1200-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v2, v6
+; GFX1200-NEXT:    s_cbranch_vccz .LBB11_10
+; GFX1200-NEXT:  ; %bb.9: ; %frem.else
+; GFX1200-NEXT:    v_bfi_b32 v7, 0x7fffffff, 0, v3
+; GFX1200-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v6
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT:    v_cndmask_b32_e32 v6, v3, v7, vcc_lo
 ; GFX1200-NEXT:    s_cbranch_execz .LBB11_11
 ; GFX1200-NEXT:    s_branch .LBB11_16
 ; GFX1200-NEXT:  .LBB11_10:
-; GFX1200-NEXT:    ; implicit-def: $vgpr1
+; GFX1200-NEXT:    ; implicit-def: $vgpr6
 ; GFX1200-NEXT:  .LBB11_11: ; %frem.compute
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s2|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, s5
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v7, |v1|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v6, |v3|
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v9, v3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 1
-; GFX1200-NEXT:    v_ldexp_f32 v3, v1, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v1, s2
+; GFX1200-NEXT:    v_ldexp_f32 v7, v7, 1
+; GFX1200-NEXT:    v_ldexp_f32 v8, v6, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v6, v1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX1200-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
+; GFX1200-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX1200-NEXT:    v_div_scale_f32 v11, null, v7, v7, 1.0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX1200-NEXT:    v_add_nc_u32_e32 v1, -1, v1
-; GFX1200-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT:    v_readfirstlane_b32 s1, v6
+; GFX1200-NEXT:    v_add_nc_u32_e32 v6, -1, v6
+; GFX1200-NEXT:    v_rcp_f32_e32 v12, v11
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v5, v1
-; GFX1200-NEXT:    v_add_nc_u32_e32 v5, v5, v4
-; GFX1200-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1200-NEXT:    v_not_b32_e32 v10, v6
+; GFX1200-NEXT:    v_add_nc_u32_e32 v10, v10, v9
+; GFX1200-NEXT:    v_div_scale_f32 v9, vcc_lo, 1.0, v7, 1.0
 ; GFX1200-NEXT:    s_denorm_mode 15
 ; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
-; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v7
+; GFX1200-NEXT:    v_fma_f32 v13, -v11, v12, 1.0
+; GFX1200-NEXT:    v_fmac_f32_e32 v12, v13, v12
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v8, v4, v7
-; GFX1200-NEXT:    v_fma_f32 v9, -v6, v8, v4
+; GFX1200-NEXT:    v_mul_f32_e32 v13, v9, v12
+; GFX1200-NEXT:    v_fma_f32 v14, -v11, v13, v9
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fmac_f32_e32 v8, v9, v7
-; GFX1200-NEXT:    v_fma_f32 v4, -v6, v8, v4
+; GFX1200-NEXT:    v_fmac_f32_e32 v13, v14, v12
+; GFX1200-NEXT:    v_fma_f32 v9, -v11, v13, v9
 ; GFX1200-NEXT:    s_denorm_mode 12
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
-; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
-; GFX1200-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
+; GFX1200-NEXT:    v_div_fmas_f32 v9, v9, v12, v13
+; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v10
+; GFX1200-NEXT:    v_div_fixup_f32 v9, v9, v7, 1.0
 ; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_15
 ; GFX1200-NEXT:  ; %bb.12: ; %frem.loop_body.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s7, s7, s8
+; GFX1200-NEXT:    s_sub_co_i32 s0, s0, s1
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s7, s7, 12
+; GFX1200-NEXT:    s_add_co_i32 s0, s0, 12
 ; GFX1200-NEXT:  .LBB11_13: ; %frem.loop_body
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_mov_b32_e32 v6, v3
+; GFX1200-NEXT:    v_mov_b32_e32 v11, v8
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s7, s7, -12
+; GFX1200-NEXT:    s_add_co_i32 s0, s0, -12
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_gt_i32 s7, 12
-; GFX1200-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1200-NEXT:    s_cmp_gt_i32 s0, 12
+; GFX1200-NEXT:    v_mul_f32_e32 v8, v11, v9
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_rndne_f32_e32 v3, v3
-; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_rndne_f32_e32 v8, v8
+; GFX1200-NEXT:    v_xor_b32_e32 v8, 0x80000000, v8
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v3, v3, v2, v6
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
-; GFX1200-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1200-NEXT:    v_fma_f32 v8, v8, v7, v11
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v8
+; GFX1200-NEXT:    v_add_f32_e32 v10, v8, v7
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
-; GFX1200-NEXT:    v_ldexp_f32 v3, v3, 12
+; GFX1200-NEXT:    v_cndmask_b32_e32 v8, v8, v10, vcc_lo
+; GFX1200-NEXT:    v_ldexp_f32 v8, v8, 12
 ; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_13
 ; GFX1200-NEXT:  ; %bb.14: ; %Flow
-; GFX1200-NEXT:    v_mov_b32_e32 v5, s7
-; GFX1200-NEXT:    v_mov_b32_e32 v3, v6
+; GFX1200-NEXT:    v_mov_b32_e32 v10, s0
+; GFX1200-NEXT:    v_mov_b32_e32 v8, v11
 ; GFX1200-NEXT:  .LBB11_15: ; %frem.loop_exit
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_add_nc_u32_e32 v5, -11, v5
-; GFX1200-NEXT:    v_ldexp_f32 v3, v3, v5
+; GFX1200-NEXT:    v_add_nc_u32_e32 v10, -11, v10
+; GFX1200-NEXT:    v_ldexp_f32 v8, v8, v10
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v4, v3, v4
-; GFX1200-NEXT:    v_rndne_f32_e32 v4, v4
+; GFX1200-NEXT:    v_mul_f32_e32 v9, v8, v9
+; GFX1200-NEXT:    v_rndne_f32_e32 v9, v9
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX1200-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1200-NEXT:    v_xor_b32_e32 v9, 0x80000000, v9
+; GFX1200-NEXT:    v_fmac_f32_e32 v8, v9, v7
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
-; GFX1200-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v8
+; GFX1200-NEXT:    v_add_f32_e32 v7, v8, v7
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v7, v8, v7, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s5
+; GFX1200-NEXT:    v_ldexp_f32 v6, v7, v6
+; GFX1200-NEXT:    v_bfi_b32 v6, 0x7fffffff, v6, v3
 ; GFX1200-NEXT:  .LBB11_16: ; %Flow50
-; GFX1200-NEXT:    s_cmp_lg_f32 s4, 0
-; GFX1200-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s3, 0x7f800000
-; GFX1200-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_and_b32 vcc_lo, s3, s4
-; GFX1200-NEXT:    s_cmp_lg_f32 s2, 0
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
-; GFX1200-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s6, 0x7f800000
-; GFX1200-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1200-NEXT:    v_cmp_lg_f32_e32 vcc_lo, 0, v0
+; GFX1200-NEXT:    v_cmp_nle_f32_e64 s0, 0x7f800000, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v3, 0
+; GFX1200-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
+; GFX1200-NEXT:    v_cmp_nle_f32_e64 s0, 0x7f800000, v2
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_and_b32 vcc_lo, s3, s2
+; GFX1200-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v5, vcc_lo
+; GFX1200-NEXT:    v_cmp_lg_f32_e32 vcc_lo, 0, v1
+; GFX1200-NEXT:    s_and_b32 vcc_lo, s0, vcc_lo
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0x7fc00000, v1
-; GFX1200-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
+; GFX1200-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v6, vcc_lo
+; GFX1200-NEXT:    global_store_b64 v3, v[0:1], s[8:9]
 ; GFX1200-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <2 x float>, ptr addrspace(1) %in2, i32 4
@@ -15128,23 +15117,23 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX1150-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
+; GFX1150-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1150-NEXT:    v_readfirstlane_b32 s10, v1
 ; GFX1150-NEXT:    v_readfirstlane_b32 s9, v2
 ; GFX1150-NEXT:    v_readfirstlane_b32 s7, v3
-; GFX1150-NEXT:    global_load_b128 v[1:4], v4, s[4:5] offset:64
-; GFX1150-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX1150-NEXT:    s_and_b32 s5, s8, 0x7fffffff
+; GFX1150-NEXT:    global_load_b128 v[0:3], v4, s[4:5] offset:64
+; GFX1150-NEXT:    s_and_b32 s6, s8, 0x7fffffff
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_readfirstlane_b32 s6, v1
-; GFX1150-NEXT:    v_readfirstlane_b32 s4, v2
-; GFX1150-NEXT:    v_readfirstlane_b32 s3, v3
-; GFX1150-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX1150-NEXT:    s_and_b32 s12, s6, 0x7fffffff
+; GFX1150-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX1150-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX1150-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX1150-NEXT:    s_and_b32 s12, s5, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s5, s12
+; GFX1150-NEXT:    s_cmp_ngt_f32 s6, s12
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_2
 ; GFX1150-NEXT:  ; %bb.1: ; %frem.else78
-; GFX1150-NEXT:    s_cmp_eq_f32 s5, s12
+; GFX1150-NEXT:    s_cmp_eq_f32 s6, s12
 ; GFX1150-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s8
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -15154,13 +15143,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:  .LBB12_2:
 ; GFX1150-NEXT:    ; implicit-def: $vgpr0
 ; GFX1150-NEXT:  .LBB12_3: ; %frem.compute77
-; GFX1150-NEXT:    v_frexp_mant_f32_e64 v1, |s6|
+; GFX1150-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
 ; GFX1150-NEXT:    v_frexp_mant_f32_e64 v0, |s8|
 ; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v3, s8
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1150-NEXT:    v_ldexp_f32 v1, v1, 1
 ; GFX1150-NEXT:    v_ldexp_f32 v2, v0, 12
-; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v0, s6
+; GFX1150-NEXT:    v_frexp_exp_i32_f32_e32 v0, s5
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX1150-NEXT:    v_readfirstlane_b32 s11, v3
 ; GFX1150-NEXT:    v_div_scale_f32 v5, null, v1, v1, 1.0
@@ -15514,13 +15503,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_ldexp_f32 v3, v4, v3
 ; GFX1150-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s7
 ; GFX1150-NEXT:  .LBB12_32: ; %Flow116
-; GFX1150-NEXT:    s_cmp_lg_f32 s6, 0
+; GFX1150-NEXT:    s_cmp_lg_f32 s5, 0
 ; GFX1150-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1150-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX1150-NEXT:    s_cmp_nge_f32 s5, 0x7f800000
 ; GFX1150-NEXT:    s_cselect_b32 s5, -1, 0
+; GFX1150-NEXT:    s_cmp_nge_f32 s6, 0x7f800000
+; GFX1150-NEXT:    s_cselect_b32 s6, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX1150-NEXT:    s_and_b32 vcc_lo, s6, s5
 ; GFX1150-NEXT:    s_cmp_lg_f32 s4, 0
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
 ; GFX1150-NEXT:    s_cselect_b32 s4, -1, 0
@@ -15555,148 +15544,49 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
 ; GFX1200-NEXT:    global_load_b128 v[0:3], v4, s[2:3]
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s10, v1
-; GFX1200-NEXT:    v_readfirstlane_b32 s9, v2
-; GFX1200-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX1200-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX1200-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX1200-NEXT:    v_readfirstlane_b32 s6, v3
 ; GFX1200-NEXT:    global_load_b128 v[1:4], v4, s[4:5] offset:64
-; GFX1200-NEXT:    v_readfirstlane_b32 s8, v0
-; GFX1200-NEXT:    s_and_b32 s5, s8, 0x7fffffff
+; GFX1200-NEXT:    v_readfirstlane_b32 s7, v0
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_readfirstlane_b32 s6, v1
 ; GFX1200-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1200-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX1200-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v0
+; GFX1200-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v1
+; GFX1200-NEXT:    v_readfirstlane_b32 s5, v1
 ; GFX1200-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX1200-NEXT:    s_and_b32 s12, s6, 0x7fffffff
-; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s5, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_2
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX1200-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v2, v3
+; GFX1200-NEXT:    s_cbranch_vccz .LBB12_2
 ; GFX1200-NEXT:  ; %bb.1: ; %frem.else78
-; GFX1200-NEXT:    s_cmp_eq_f32 s5, s12
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s8
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
+; GFX1200-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, s7
+; GFX1200-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v2, v3
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX1200-NEXT:    v_cndmask_b32_e32 v3, s7, v4, vcc_lo
 ; GFX1200-NEXT:    s_cbranch_execz .LBB12_3
 ; GFX1200-NEXT:    s_branch .LBB12_8
 ; GFX1200-NEXT:  .LBB12_2:
-; GFX1200-NEXT:    ; implicit-def: $vgpr0
+; GFX1200-NEXT:    ; implicit-def: $vgpr3
 ; GFX1200-NEXT:  .LBB12_3: ; %frem.compute77
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s6|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |s8|
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v3, s8
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, v0
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |v0|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v5, |v1|
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_ldexp_f32 v1, v1, 1
-; GFX1200-NEXT:    v_ldexp_f32 v2, v0, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v0, s6
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s11, v3
-; GFX1200-NEXT:    v_div_scale_f32 v5, null, v1, v1, 1.0
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s12, v0
-; GFX1200-NEXT:    v_add_nc_u32_e32 v0, -1, v0
-; GFX1200-NEXT:    v_rcp_f32_e32 v6, v5
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v4, v0
-; GFX1200-NEXT:    v_add_nc_u32_e32 v4, v4, v3
-; GFX1200-NEXT:    v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
-; GFX1200-NEXT:    s_denorm_mode 15
-; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v7, -v5, v6, 1.0
-; GFX1200-NEXT:    v_fmac_f32_e32 v6, v7, v6
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v7, v3, v6
-; GFX1200-NEXT:    v_fma_f32 v8, -v5, v7, v3
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
-; GFX1200-NEXT:    v_fma_f32 v3, -v5, v7, v3
-; GFX1200-NEXT:    s_denorm_mode 12
-; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
-; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
-; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_7
-; GFX1200-NEXT:  ; %bb.4: ; %frem.loop_body85.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
-; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_5: ; %frem.loop_body85
-; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX1200-NEXT:    v_mov_b32_e32 v5, v2
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, -12
-; GFX1200-NEXT:    s_cmp_gt_i32 s11, 12
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v2, v5, v3
-; GFX1200-NEXT:    v_rndne_f32_e32 v2, v2
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_xor_b32_e32 v2, 0x80000000, v2
-; GFX1200-NEXT:    v_fma_f32 v2, v2, v1, v5
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
-; GFX1200-NEXT:    v_add_f32_e32 v4, v2, v1
-; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_5
-; GFX1200-NEXT:  ; %bb.6: ; %Flow125
-; GFX1200-NEXT:    v_mov_b32_e32 v4, s11
-; GFX1200-NEXT:    v_mov_b32_e32 v2, v5
-; GFX1200-NEXT:  .LBB12_7: ; %frem.loop_exit86
+; GFX1200-NEXT:    v_readfirstlane_b32 s10, v4
+; GFX1200-NEXT:    v_ldexp_f32 v3, v0, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v0, v1
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1200-NEXT:    v_ldexp_f32 v1, v5, 1
+; GFX1200-NEXT:    v_readfirstlane_b32 s11, v0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_add_nc_u32_e32 v4, -11, v4
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, v4
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_mul_f32_e32 v3, v2, v3
-; GFX1200-NEXT:    v_rndne_f32_e32 v3, v3
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
-; GFX1200-NEXT:    v_fmac_f32_e32 v2, v3, v1
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v2
-; GFX1200-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc_lo
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s8
-; GFX1200-NEXT:  .LBB12_8:
-; GFX1200-NEXT:    s_and_b32 s8, s10, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s4, 0x7fffffff
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_ngt_f32 s8, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_10
-; GFX1200-NEXT:  ; %bb.9: ; %frem.else47
-; GFX1200-NEXT:    s_cmp_eq_f32 s8, s12
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s10
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, s10, v1, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_11
-; GFX1200-NEXT:    s_branch .LBB12_16
-; GFX1200-NEXT:  .LBB12_10:
-; GFX1200-NEXT:    ; implicit-def: $vgpr1
-; GFX1200-NEXT:  .LBB12_11: ; %frem.compute46
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s4|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s10|
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, s10
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 1
-; GFX1200-NEXT:    v_ldexp_f32 v3, v1, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v1, s4
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s11, v4
-; GFX1200-NEXT:    v_div_scale_f32 v6, null, v2, v2, 1.0
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s12, v1
-; GFX1200-NEXT:    v_add_nc_u32_e32 v1, -1, v1
+; GFX1200-NEXT:    v_div_scale_f32 v6, null, v1, v1, 1.0
 ; GFX1200-NEXT:    v_rcp_f32_e32 v7, v6
+; GFX1200-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v5, v1
+; GFX1200-NEXT:    v_not_b32_e32 v5, v0
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v5, v5, v4
-; GFX1200-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v2, 1.0
+; GFX1200-NEXT:    v_div_scale_f32 v4, vcc_lo, 1.0, v1, 1.0
 ; GFX1200-NEXT:    s_denorm_mode 15
 ; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_fma_f32 v8, -v6, v7, 1.0
@@ -15712,37 +15602,36 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1200-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
-; GFX1200-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_15
-; GFX1200-NEXT:  ; %bb.12: ; %frem.loop_body54.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_13: ; %frem.loop_body54
+; GFX1200-NEXT:    v_div_fixup_f32 v4, v4, v1, 1.0
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_7
+; GFX1200-NEXT:  ; %bb.4: ; %frem.loop_body85.preheader
+; GFX1200-NEXT:    s_sub_co_i32 s10, s10, s11
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, 12
+; GFX1200-NEXT:  .LBB12_5: ; %frem.loop_body85
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v6, v3
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, -12
-; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_gt_i32 s11, 12
-; GFX1200-NEXT:    v_mul_f32_e32 v3, v6, v4
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, -12
+; GFX1200-NEXT:    s_cmp_gt_i32 s10, 12
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f32_e32 v3, v6, v4
 ; GFX1200-NEXT:    v_rndne_f32_e32 v3, v3
-; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v3, v3, v2, v6
+; GFX1200-NEXT:    v_xor_b32_e32 v3, 0x80000000, v3
+; GFX1200-NEXT:    v_fma_f32 v3, v3, v1, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
-; GFX1200-NEXT:    v_add_f32_e32 v5, v3, v2
+; GFX1200-NEXT:    v_add_f32_e32 v5, v3, v1
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_13
-; GFX1200-NEXT:  ; %bb.14: ; %Flow121
-; GFX1200-NEXT:    v_mov_b32_e32 v5, s11
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_5
+; GFX1200-NEXT:  ; %bb.6: ; %Flow125
+; GFX1200-NEXT:    v_mov_b32_e32 v5, s10
 ; GFX1200-NEXT:    v_mov_b32_e32 v3, v6
-; GFX1200-NEXT:  .LBB12_15: ; %frem.loop_exit55
+; GFX1200-NEXT:  .LBB12_7: ; %frem.loop_exit86
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v5, -11, v5
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, v5
@@ -15751,51 +15640,51 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
-; GFX1200-NEXT:    v_fmac_f32_e32 v3, v4, v2
+; GFX1200-NEXT:    v_fmac_f32_e32 v3, v4, v1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v3
-; GFX1200-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX1200-NEXT:    v_add_f32_e32 v1, v3, v1
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v1, v2, v1
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s10
-; GFX1200-NEXT:  .LBB12_16:
-; GFX1200-NEXT:    s_and_b32 s10, s9, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s3, 0x7fffffff
+; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1200-NEXT:    v_bfi_b32 v3, 0x7fffffff, v0, s7
+; GFX1200-NEXT:  .LBB12_8:
+; GFX1200-NEXT:    s_and_b32 s7, s9, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s4, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_ngt_f32 s10, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_18
-; GFX1200-NEXT:  ; %bb.17: ; %frem.else16
-; GFX1200-NEXT:    s_cmp_eq_f32 s10, s12
-; GFX1200-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, s9
+; GFX1200-NEXT:    s_cmp_ngt_f32 s7, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_10
+; GFX1200-NEXT:  ; %bb.9: ; %frem.else47
+; GFX1200-NEXT:    s_cmp_eq_f32 s7, s11
+; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s9
 ; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, s9, v2, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_19
-; GFX1200-NEXT:    s_branch .LBB12_24
-; GFX1200-NEXT:  .LBB12_18:
-; GFX1200-NEXT:    ; implicit-def: $vgpr2
-; GFX1200-NEXT:  .LBB12_19: ; %frem.compute15
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v3, |s3|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
+; GFX1200-NEXT:    v_cndmask_b32_e32 v0, s9, v0, vcc_lo
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_11
+; GFX1200-NEXT:    s_branch .LBB12_16
+; GFX1200-NEXT:  .LBB12_10:
+; GFX1200-NEXT:    ; implicit-def: $vgpr0
+; GFX1200-NEXT:  .LBB12_11: ; %frem.compute46
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |s9|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v5, s9
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_ldexp_f32 v3, v3, 1
-; GFX1200-NEXT:    v_ldexp_f32 v4, v2, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v2, s3
+; GFX1200-NEXT:    v_ldexp_f32 v1, v1, 1
+; GFX1200-NEXT:    v_ldexp_f32 v4, v0, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v0, s4
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s11, v5
-; GFX1200-NEXT:    v_div_scale_f32 v7, null, v3, v3, 1.0
+; GFX1200-NEXT:    v_readfirstlane_b32 s10, v5
+; GFX1200-NEXT:    v_div_scale_f32 v7, null, v1, v1, 1.0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s12, v2
-; GFX1200-NEXT:    v_add_nc_u32_e32 v2, -1, v2
+; GFX1200-NEXT:    v_readfirstlane_b32 s11, v0
+; GFX1200-NEXT:    v_add_nc_u32_e32 v0, -1, v0
 ; GFX1200-NEXT:    v_rcp_f32_e32 v8, v7
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v6, v2
+; GFX1200-NEXT:    v_not_b32_e32 v6, v0
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v6, v6, v5
-; GFX1200-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v3, 1.0
+; GFX1200-NEXT:    v_div_scale_f32 v5, vcc_lo, 1.0, v1, 1.0
 ; GFX1200-NEXT:    s_denorm_mode 15
 ; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_fma_f32 v9, -v7, v8, 1.0
@@ -15811,37 +15700,37 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1200-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
-; GFX1200-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_23
-; GFX1200-NEXT:  ; %bb.20: ; %frem.loop_body23.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
+; GFX1200-NEXT:    v_div_fixup_f32 v5, v5, v1, 1.0
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_15
+; GFX1200-NEXT:  ; %bb.12: ; %frem.loop_body54.preheader
+; GFX1200-NEXT:    s_sub_co_i32 s10, s10, s11
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_21: ; %frem.loop_body23
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, 12
+; GFX1200-NEXT:  .LBB12_13: ; %frem.loop_body54
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, -12
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, -12
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_gt_i32 s11, 12
+; GFX1200-NEXT:    s_cmp_gt_i32 s10, 12
 ; GFX1200-NEXT:    v_mul_f32_e32 v4, v7, v5
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_rndne_f32_e32 v4, v4
 ; GFX1200-NEXT:    v_xor_b32_e32 v4, 0x80000000, v4
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_fma_f32 v4, v4, v3, v7
+; GFX1200-NEXT:    v_fma_f32 v4, v4, v1, v7
 ; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
-; GFX1200-NEXT:    v_add_f32_e32 v6, v4, v3
+; GFX1200-NEXT:    v_add_f32_e32 v6, v4, v1
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v4, v4, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_21
-; GFX1200-NEXT:  ; %bb.22: ; %Flow117
-; GFX1200-NEXT:    v_mov_b32_e32 v6, s11
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_13
+; GFX1200-NEXT:  ; %bb.14: ; %Flow121
+; GFX1200-NEXT:    v_mov_b32_e32 v6, s10
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, v7
-; GFX1200-NEXT:  .LBB12_23: ; %frem.loop_exit24
+; GFX1200-NEXT:  .LBB12_15: ; %frem.loop_exit55
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v6, -11, v6
 ; GFX1200-NEXT:    v_ldexp_f32 v4, v4, v6
@@ -15850,49 +15739,49 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_rndne_f32_e32 v5, v5
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_xor_b32_e32 v5, 0x80000000, v5
-; GFX1200-NEXT:    v_fmac_f32_e32 v4, v5, v3
+; GFX1200-NEXT:    v_fmac_f32_e32 v4, v5, v1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v4
-; GFX1200-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX1200-NEXT:    v_add_f32_e32 v1, v4, v1
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
-; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v2, v3, v2
-; GFX1200-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, s9
-; GFX1200-NEXT:  .LBB12_24:
-; GFX1200-NEXT:    s_and_b32 s9, s7, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s2, 0x7fffffff
+; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
+; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s9
+; GFX1200-NEXT:  .LBB12_16:
+; GFX1200-NEXT:    s_and_b32 s9, s8, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s3, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_ngt_f32 s9, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_26
-; GFX1200-NEXT:  ; %bb.25: ; %frem.else
-; GFX1200-NEXT:    s_cmp_eq_f32 s9, s12
-; GFX1200-NEXT:    v_bfi_b32 v3, 0x7fffffff, 0, s7
+; GFX1200-NEXT:    s_cmp_ngt_f32 s9, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_18
+; GFX1200-NEXT:  ; %bb.17: ; %frem.else16
+; GFX1200-NEXT:    s_cmp_eq_f32 s9, s11
+; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s8
 ; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_27
-; GFX1200-NEXT:    s_branch .LBB12_32
-; GFX1200-NEXT:  .LBB12_26:
-; GFX1200-NEXT:    ; implicit-def: $vgpr3
-; GFX1200-NEXT:  .LBB12_27: ; %frem.compute
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v4, |s2|
-; GFX1200-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v6, s7
+; GFX1200-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_19
+; GFX1200-NEXT:    s_branch .LBB12_24
+; GFX1200-NEXT:  .LBB12_18:
+; GFX1200-NEXT:    ; implicit-def: $vgpr1
+; GFX1200-NEXT:  .LBB12_19: ; %frem.compute15
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v4, |s3|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s8|
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v6, s8
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX1200-NEXT:    v_ldexp_f32 v4, v4, 1
-; GFX1200-NEXT:    v_ldexp_f32 v5, v3, 12
-; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v3, s2
+; GFX1200-NEXT:    v_ldexp_f32 v5, v1, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v1, s3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX1200-NEXT:    v_readfirstlane_b32 s11, v6
+; GFX1200-NEXT:    v_readfirstlane_b32 s10, v6
 ; GFX1200-NEXT:    v_div_scale_f32 v8, null, v4, v4, 1.0
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX1200-NEXT:    v_readfirstlane_b32 s12, v3
-; GFX1200-NEXT:    v_add_nc_u32_e32 v3, -1, v3
+; GFX1200-NEXT:    v_readfirstlane_b32 s11, v1
+; GFX1200-NEXT:    v_add_nc_u32_e32 v1, -1, v1
 ; GFX1200-NEXT:    v_rcp_f32_e32 v9, v8
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_not_b32_e32 v7, v3
+; GFX1200-NEXT:    v_not_b32_e32 v7, v1
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v7, v7, v6
 ; GFX1200-NEXT:    v_div_scale_f32 v6, vcc_lo, 1.0, v4, 1.0
 ; GFX1200-NEXT:    s_denorm_mode 15
@@ -15911,19 +15800,19 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
 ; GFX1200-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_31
-; GFX1200-NEXT:  ; %bb.28: ; %frem.loop_body.preheader
-; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_23
+; GFX1200-NEXT:  ; %bb.20: ; %frem.loop_body23.preheader
+; GFX1200-NEXT:    s_sub_co_i32 s10, s10, s11
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_29: ; %frem.loop_body
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, 12
+; GFX1200-NEXT:  .LBB12_21: ; %frem.loop_body23
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v8, v5
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_add_co_i32 s11, s11, -12
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, -12
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_cmp_gt_i32 s11, 12
+; GFX1200-NEXT:    s_cmp_gt_i32 s10, 12
 ; GFX1200-NEXT:    v_mul_f32_e32 v5, v8, v6
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_rndne_f32_e32 v5, v5
@@ -15936,11 +15825,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v5, v5, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_29
-; GFX1200-NEXT:  ; %bb.30: ; %Flow
-; GFX1200-NEXT:    v_mov_b32_e32 v7, s11
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_21
+; GFX1200-NEXT:  ; %bb.22: ; %Flow117
+; GFX1200-NEXT:    v_mov_b32_e32 v7, s10
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, v8
-; GFX1200-NEXT:  .LBB12_31: ; %frem.loop_exit
+; GFX1200-NEXT:  .LBB12_23: ; %frem.loop_exit24
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v7, -11, v7
 ; GFX1200-NEXT:    v_ldexp_f32 v5, v5, v7
@@ -15956,43 +15845,141 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_wait_alu 0xfffd
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-NEXT:    v_ldexp_f32 v3, v4, v3
-; GFX1200-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s7
+; GFX1200-NEXT:    v_ldexp_f32 v1, v4, v1
+; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s8
+; GFX1200-NEXT:  .LBB12_24:
+; GFX1200-NEXT:    s_and_b32 s8, s6, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s2, 0x7fffffff
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_cmp_ngt_f32 s8, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_26
+; GFX1200-NEXT:  ; %bb.25: ; %frem.else
+; GFX1200-NEXT:    s_cmp_eq_f32 s8, s11
+; GFX1200-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, s6
+; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_cndmask_b32_e32 v4, s6, v4, vcc_lo
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_27
+; GFX1200-NEXT:    s_branch .LBB12_32
+; GFX1200-NEXT:  .LBB12_26:
+; GFX1200-NEXT:    ; implicit-def: $vgpr4
+; GFX1200-NEXT:  .LBB12_27: ; %frem.compute
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v5, |s2|
+; GFX1200-NEXT:    v_frexp_mant_f32_e64 v4, |s6|
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v7, s6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1200-NEXT:    v_ldexp_f32 v5, v5, 1
+; GFX1200-NEXT:    v_ldexp_f32 v6, v4, 12
+; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, s2
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1200-NEXT:    v_readfirstlane_b32 s10, v7
+; GFX1200-NEXT:    v_div_scale_f32 v9, null, v5, v5, 1.0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1200-NEXT:    v_readfirstlane_b32 s11, v4
+; GFX1200-NEXT:    v_add_nc_u32_e32 v4, -1, v4
+; GFX1200-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_not_b32_e32 v8, v4
+; GFX1200-NEXT:    v_add_nc_u32_e32 v8, v8, v7
+; GFX1200-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
+; GFX1200-NEXT:    s_denorm_mode 15
+; GFX1200-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v11, -v9, v10, 1.0
+; GFX1200-NEXT:    v_fmac_f32_e32 v10, v11, v10
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f32_e32 v11, v7, v10
+; GFX1200-NEXT:    v_fma_f32 v12, -v9, v11, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fmac_f32_e32 v11, v12, v10
+; GFX1200-NEXT:    v_fma_f32 v7, -v9, v11, v7
+; GFX1200-NEXT:    s_denorm_mode 12
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1200-NEXT:    v_div_fmas_f32 v7, v7, v10, v11
+; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v8
+; GFX1200-NEXT:    v_div_fixup_f32 v7, v7, v5, 1.0
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_31
+; GFX1200-NEXT:  ; %bb.28: ; %frem.loop_body.preheader
+; GFX1200-NEXT:    s_sub_co_i32 s10, s10, s11
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, 12
+; GFX1200-NEXT:  .LBB12_29: ; %frem.loop_body
+; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v9, v6
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_add_co_i32 s10, s10, -12
+; GFX1200-NEXT:    s_wait_alu 0xfffe
+; GFX1200-NEXT:    s_cmp_gt_i32 s10, 12
+; GFX1200-NEXT:    v_mul_f32_e32 v6, v9, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_rndne_f32_e32 v6, v6
+; GFX1200-NEXT:    v_xor_b32_e32 v6, 0x80000000, v6
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_fma_f32 v6, v6, v5, v9
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1200-NEXT:    v_add_f32_e32 v8, v6, v5
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc_lo
+; GFX1200-NEXT:    v_ldexp_f32 v6, v6, 12
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_29
+; GFX1200-NEXT:  ; %bb.30: ; %Flow
+; GFX1200-NEXT:    v_mov_b32_e32 v8, s10
+; GFX1200-NEXT:    v_mov_b32_e32 v6, v9
+; GFX1200-NEXT:  .LBB12_31: ; %frem.loop_exit
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_add_nc_u32_e32 v8, -11, v8
+; GFX1200-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX1200-NEXT:    v_rndne_f32_e32 v7, v7
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_xor_b32_e32 v7, 0x80000000, v7
+; GFX1200-NEXT:    v_fmac_f32_e32 v6, v7, v5
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_cmp_gt_f32_e32 vcc_lo, 0, v6
+; GFX1200-NEXT:    v_add_f32_e32 v5, v6, v5
+; GFX1200-NEXT:    s_wait_alu 0xfffd
+; GFX1200-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_ldexp_f32 v4, v5, v4
+; GFX1200-NEXT:    v_bfi_b32 v4, 0x7fffffff, v4, s6
 ; GFX1200-NEXT:  .LBB12_32: ; %Flow116
-; GFX1200-NEXT:    s_cmp_lg_f32 s6, 0
-; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
-; GFX1200-NEXT:    s_cselect_b32 s6, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s5, 0x7f800000
+; GFX1200-NEXT:    s_cmp_lg_f32 s5, 0
+; GFX1200-NEXT:    v_cmp_nle_f32_e32 vcc_lo, 0x7f800000, v2
 ; GFX1200-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    s_and_b32 vcc_lo, s5, s6
+; GFX1200-NEXT:    s_and_b32 vcc_lo, vcc_lo, s5
 ; GFX1200-NEXT:    s_cmp_lg_f32 s4, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, 0x7fc00000, v0, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v5, 0x7fc00000, v3, vcc_lo
 ; GFX1200-NEXT:    s_cselect_b32 s4, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s8, 0x7f800000
+; GFX1200-NEXT:    s_cmp_nge_f32 s7, 0x7f800000
 ; GFX1200-NEXT:    s_cselect_b32 s5, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_and_b32 vcc_lo, s5, s4
 ; GFX1200-NEXT:    s_cmp_lg_f32 s3, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, 0x7fc00000, v1, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v6, 0x7fc00000, v0, vcc_lo
+; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1200-NEXT:    s_cselect_b32 s3, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s10, 0x7f800000
+; GFX1200-NEXT:    s_cmp_nge_f32 s9, 0x7f800000
 ; GFX1200-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_and_b32 vcc_lo, s4, s3
 ; GFX1200-NEXT:    s_cmp_lg_f32 s2, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, 0x7fc00000, v2, vcc_lo
+; GFX1200-NEXT:    v_cndmask_b32_e32 v7, 0x7fc00000, v1, vcc_lo
 ; GFX1200-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX1200-NEXT:    s_cmp_nge_f32 s9, 0x7f800000
+; GFX1200-NEXT:    s_cmp_nge_f32 s8, 0x7f800000
 ; GFX1200-NEXT:    s_cselect_b32 s3, -1, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
 ; GFX1200-NEXT:    s_and_b32 vcc_lo, s3, s2
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_cndmask_b32_e32 v3, 0x7fc00000, v3, vcc_lo
-; GFX1200-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
+; GFX1200-NEXT:    v_cndmask_b32_e32 v8, 0x7fc00000, v4, vcc_lo
+; GFX1200-NEXT:    global_store_b128 v0, v[5:8], s[0:1]
 ; GFX1200-NEXT:    s_endpgm
                         ptr addrspace(1) %in2) #0 {
    %gep2 = getelementptr <4 x float>, ptr addrspace(1) %in2, i32 4
@@ -17576,14 +17563,14 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
 ; SI-NEXT:    s_and_b64 s[2:3], vcc, exec
-; SI-NEXT:    s_cselect_b32 s8, 0x7ff80000, 0
-; SI-NEXT:    s_mov_b32 s2, s6
-; SI-NEXT:    s_mov_b32 s3, s7
+; SI-NEXT:    s_cselect_b32 s6, 0x7ff80000, 0
+; SI-NEXT:    s_mov_b32 s3, 0xf000
+; SI-NEXT:    s_mov_b32 s2, -1
 ; SI-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
 ; SI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; SI-NEXT:    s_cselect_b32 s4, 0x7ff80000, 0
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    v_mov_b32_e32 v1, s8
+; SI-NEXT:    v_mov_b32_e32 v1, s6
 ; SI-NEXT:    v_mov_b32_e32 v3, s4
 ; SI-NEXT:    v_mov_b32_e32 v2, v0
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
@@ -17603,10 +17590,10 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; CI-NEXT:    v_mov_b32_e32 v0, 0
 ; CI-NEXT:    s_and_b64 s[2:3], vcc, exec
 ; CI-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
-; CI-NEXT:    s_cselect_b32 s8, 0x7ff80000, 0
-; CI-NEXT:    s_mov_b32 s2, s6
-; CI-NEXT:    s_mov_b32 s3, s7
-; CI-NEXT:    v_mov_b32_e32 v1, s8
+; CI-NEXT:    s_cselect_b32 s6, 0x7ff80000, 0
+; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
+; CI-NEXT:    v_mov_b32_e32 v1, s6
 ; CI-NEXT:    v_mov_b32_e32 v2, v0
 ; CI-NEXT:    s_and_b64 s[4:5], vcc, exec
 ; CI-NEXT:    s_cselect_b32 s4, 0x7ff80000, 0
@@ -17642,14 +17629,15 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX9-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
+; GFX9-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_and_b64 s[2:3], vcc, exec
-; GFX9-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[3:4]
+; GFX9-NEXT:    v_cmp_nlg_f64_e32 vcc, 0, v[2:3]
 ; GFX9-NEXT:    s_cselect_b32 s4, 0x7ff80000, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX9-NEXT:    s_and_b64 s[2:3], vcc, exec
 ; GFX9-NEXT:    s_cselect_b32 s2, 0x7ff80000, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s2
@@ -17661,13 +17649,14 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    global_load_dwordx4 v[1:4], v0, s[2:3]
+; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[0:1]
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
-; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX10-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[2:3]
 ; GFX10-NEXT:    s_cselect_b32 s2, 0x7ff80000, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX10-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
 ; GFX10-NEXT:    s_cselect_b32 s3, 0x7ff80000, 0
@@ -17680,17 +17669,19 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX11-NEXT:    global_load_b128 v[1:4], v0, s[2:3]
+; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX11-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[0:1]
 ; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
-; GFX11-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX11-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[2:3]
 ; GFX11-NEXT:    s_cselect_b32 s2, 0x7ff80000, 0
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX11-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
 ; GFX11-NEXT:    s_cselect_b32 s3, 0x7ff80000, 0
-; GFX11-NEXT:    v_mov_b32_e32 v3, s3
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX11-NEXT:    global_store_b128 v0, v[0:3], s[0:1]
 ; GFX11-NEXT:    s_endpgm
 ;
@@ -17699,17 +17690,19 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; GFX1150-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1150-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1150-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX1150-NEXT:    global_load_b128 v[1:4], v0, s[2:3]
+; GFX1150-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
-; GFX1150-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1150-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[0:1]
 ; GFX1150-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
-; GFX1150-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1150-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[2:3]
 ; GFX1150-NEXT:    s_cselect_b32 s2, 0x7ff80000, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1150-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1150-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1150-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
 ; GFX1150-NEXT:    s_cselect_b32 s3, 0x7ff80000, 0
-; GFX1150-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1150-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1150-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1150-NEXT:    global_store_b128 v0, v[0:3], s[0:1]
 ; GFX1150-NEXT:    s_endpgm
 ;
@@ -17718,18 +17711,19 @@ define amdgpu_kernel void @frem_v2f64_const_zero_num(ptr addrspace(1) %out, ptr
 ; GFX1200-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1200-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1200-NEXT:    s_wait_kmcnt 0x0
-; GFX1200-NEXT:    global_load_b128 v[1:4], v0, s[2:3]
+; GFX1200-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
-; GFX1200-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[1:2]
+; GFX1200-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[0:1]
 ; GFX1200-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
-; GFX1200-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[3:4]
+; GFX1200-NEXT:    v_cmp_nlg_f64_e32 vcc_lo, 0, v[2:3]
 ; GFX1200-NEXT:    s_cselect_b32 s2, 0x7ff80000, 0
-; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, v0
+; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX1200-NEXT:    s_and_b32 s3, vcc_lo, exec_lo
 ; GFX1200-NEXT:    s_cselect_b32 s3, 0x7ff80000, 0
 ; GFX1200-NEXT:    s_wait_alu 0xfffe
-; GFX1200-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1200-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s3
+; GFX1200-NEXT:    v_mov_b32_e32 v2, v0
 ; GFX1200-NEXT:    global_store_b128 v0, v[0:3], s[0:1]
 ; GFX1200-NEXT:    s_endpgm
    %r0 = load <2 x double>, ptr addrspace(1) %in, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll
index 670e2c5b2c9e0..91584ca9dbd96 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-vector-sink.ll
@@ -12,7 +12,8 @@ define amdgpu_kernel void @runningSum(ptr addrspace(1) %out0, ptr addrspace(1) %
 ; OPT:       [[LOOPBODY]]:
 ; OPT-NEXT:    [[PREVIOUSSUM:%.*]] = phi <2 x i32> [ [[TMP1]], %[[PREHEADER]] ], [ [[RUNNINGSUM:%.*]], %[[LOOPBODY]] ]
 ; OPT-NEXT:    [[ITERCOUNT:%.*]] = phi i32 [ [[INPUTITER]], %[[PREHEADER]] ], [ [[ITERSLEFT:%.*]], %[[LOOPBODY]] ]
-; OPT-NEXT:    [[RUNNINGSUM]] = add <2 x i32> [[TMP1]], [[PREVIOUSSUM]]
+; OPT-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[VECELEMENT1]], <2 x i32> poison, <2 x i32> zeroinitializer
+; OPT-NEXT:    [[RUNNINGSUM]] = add <2 x i32> [[TMP0]], [[PREVIOUSSUM]]
 ; OPT-NEXT:    [[ITERSLEFT]] = sub i32 [[ITERCOUNT]], 1
 ; OPT-NEXT:    [[COND:%.*]] = icmp eq i32 [[ITERSLEFT]], 0
 ; OPT-NEXT:    br i1 [[COND]], label %[[LOOPEXIT:.*]], label %[[LOOPBODY]]
@@ -83,12 +84,12 @@ define amdgpu_kernel void @test_sink_extract_operands(ptr addrspace(1) %out0, pt
 ; OPT-SAME: ptr addrspace(1) [[OUT0:%.*]], ptr addrspace(1) [[OUT1:%.*]], <4 x i32> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:  [[ENTRY:.*:]]
 ; OPT-NEXT:    [[VEC_FULL:%.*]] = add <4 x i32> [[INPUT_VEC]], <i32 42, i32 43, i32 44, i32 45>
-; OPT-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 0
-; OPT-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 1
 ; OPT-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
 ; OPT-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; OPT:       [[IF_THEN]]:
+; OPT-NEXT:    [[TMP0:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 0
 ; OPT-NEXT:    [[RESULT0:%.*]] = add i32 [[TMP0]], 100
+; OPT-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[VEC_FULL]], i64 1
 ; OPT-NEXT:    [[RESULT1:%.*]] = add i32 [[TMP1]], 200
 ; OPT-NEXT:    store i32 [[RESULT0]], ptr addrspace(1) [[OUT0]], align 4
 ; OPT-NEXT:    store i32 [[RESULT1]], ptr addrspace(1) [[OUT1]], align 4
@@ -121,14 +122,14 @@ define amdgpu_kernel void @test_shuffle_insert_subvector(ptr addrspace(1) %ptr,
 ; OPT-NEXT:  [[ENTRY:.*:]]
 ; OPT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
 ; OPT-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> [[VEC2]], <4 x i32> <i32 2, i32 3, i32 6, i32 7>
-; OPT-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; OPT-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <4 x i16> [[VEC2]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; OPT-NEXT:    [[SHUFFLE5:%.*]] = shufflevector <4 x i16> [[SHUFFLE]], <4 x i16> [[SHUFFLE2]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
 ; OPT-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
 ; OPT-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; OPT:       [[IF_THEN]]:
 ; OPT-NEXT:    [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE5]], <i16 100, i16 200, i16 300, i16 400>
+; OPT-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[VEC1]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; OPT-NEXT:    [[OTHER_RESULT:%.*]] = mul <4 x i16> [[SHUFFLE3]], splat (i16 2)
+; OPT-NEXT:    [[SHUFFLE4:%.*]] = shufflevector <4 x i16> [[VEC2]], <4 x i16> poison, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
 ; OPT-NEXT:    [[MORE_RESULT:%.*]] = sub <4 x i16> [[SHUFFLE4]], splat (i16 5)
 ; OPT-NEXT:    store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8
 ; OPT-NEXT:    store <4 x i16> [[OTHER_RESULT]], ptr addrspace(1) [[PTR]], align 8
@@ -164,14 +165,14 @@ define amdgpu_kernel void @test_shuffle_extract_subvector(ptr addrspace(1) %ptr,
 ; OPT-LABEL: define amdgpu_kernel void @test_shuffle_extract_subvector(
 ; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <4 x i16> [[INPUT_VEC:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:  [[ENTRY:.*:]]
-; OPT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
-; OPT-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
-; OPT-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; OPT-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
 ; OPT-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; OPT:       [[IF_THEN]]:
+; OPT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
 ; OPT-NEXT:    [[RESULT_VEC:%.*]] = add <2 x i16> [[SHUFFLE]], <i16 100, i16 200>
+; OPT-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
 ; OPT-NEXT:    [[RESULT_VEC2:%.*]] = mul <2 x i16> [[SHUFFLE2]], splat (i16 3)
+; OPT-NEXT:    [[SHUFFLE3:%.*]] = shufflevector <4 x i16> [[INPUT_VEC]], <4 x i16> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; OPT-NEXT:    [[RESULT_VEC3:%.*]] = sub <4 x i16> [[SHUFFLE3]], splat (i16 10)
 ; OPT-NEXT:    store <2 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 4
 ; OPT-NEXT:    store <2 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 4
@@ -205,12 +206,12 @@ define amdgpu_kernel void @test_shuffle_sink_operands(ptr addrspace(1) %ptr, <2
 ; OPT-LABEL: define amdgpu_kernel void @test_shuffle_sink_operands(
 ; OPT-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x i16> [[INPUT_VEC:%.*]], <2 x i16> [[INPUT_VEC2:%.*]], i32 [[TID:%.*]], i32 [[COND:%.*]]) #[[ATTR0]] {
 ; OPT-NEXT:  [[ENTRY:.*:]]
-; OPT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[INPUT_VEC]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; OPT-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i16> [[INPUT_VEC2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; OPT-NEXT:    [[CMP:%.*]] = icmp slt i32 [[TID]], [[COND]]
 ; OPT-NEXT:    br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]]
 ; OPT:       [[IF_THEN]]:
+; OPT-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[INPUT_VEC]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; OPT-NEXT:    [[RESULT_VEC:%.*]] = add <4 x i16> [[SHUFFLE]], <i16 100, i16 200, i16 300, i16 400>
+; OPT-NEXT:    [[SHUFFLE2:%.*]] = shufflevector <2 x i16> [[INPUT_VEC2]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
 ; OPT-NEXT:    [[RESULT_VEC2:%.*]] = mul <4 x i16> [[SHUFFLE2]], splat (i16 5)
 ; OPT-NEXT:    store <4 x i16> [[RESULT_VEC]], ptr addrspace(1) [[PTR]], align 8
 ; OPT-NEXT:    store <4 x i16> [[RESULT_VEC2]], ptr addrspace(1) [[PTR]], align 8
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index e12e31b14e97d..35a93a7ef1447 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -3328,177 +3328,152 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-LABEL: srem_v2i64:
 ; TONGA:       ; %bb.0:
 ; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; TONGA-NEXT:    v_mov_b32_e32 v8, 0
 ; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
 ; TONGA-NEXT:    s_add_u32 s0, s6, 16
+; TONGA-NEXT:    v_mov_b32_e32 v4, s6
 ; TONGA-NEXT:    s_addc_u32 s1, s7, 0
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s0
-; TONGA-NEXT:    v_mov_b32_e32 v4, s6
-; TONGA-NEXT:    v_mov_b32_e32 v1, s1
 ; TONGA-NEXT:    v_mov_b32_e32 v5, s7
+; TONGA-NEXT:    v_mov_b32_e32 v1, s1
 ; TONGA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; TONGA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; TONGA-NEXT:    s_waitcnt vmcnt(1)
-; TONGA-NEXT:    v_readfirstlane_b32 s1, v1
-; TONGA-NEXT:    v_readfirstlane_b32 s0, v0
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
-; TONGA-NEXT:    v_readfirstlane_b32 s3, v5
-; TONGA-NEXT:    v_readfirstlane_b32 s2, v4
-; TONGA-NEXT:    s_or_b64 s[6:7], s[2:3], s[0:1]
-; TONGA-NEXT:    s_mov_b32 s6, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; TONGA-NEXT:    s_cbranch_scc0 .LBB10_3
+; TONGA-NEXT:    v_or_b32_e32 v9, v5, v1
+; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; TONGA-NEXT:    s_cbranch_vccz .LBB10_7
 ; TONGA-NEXT:  ; %bb.1:
-; TONGA-NEXT:    s_ashr_i32 s6, s1, 31
-; TONGA-NEXT:    s_add_u32 s8, s0, s6
-; TONGA-NEXT:    s_mov_b32 s7, s6
-; TONGA-NEXT:    s_addc_u32 s9, s1, s6
-; TONGA-NEXT:    s_xor_b64 s[6:7], s[8:9], s[6:7]
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, s6
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, s7
-; TONGA-NEXT:    s_sub_u32 s1, 0, s6
-; TONGA-NEXT:    s_subb_u32 s10, 0, s7
-; TONGA-NEXT:    v_madmk_f32 v0, v1, 0x4f800000, v0
-; TONGA-NEXT:    v_rcp_f32_e32 v0, v0
-; TONGA-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; TONGA-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; TONGA-NEXT:    v_trunc_f32_e32 v1, v1
-; TONGA-NEXT:    v_madmk_f32 v0, v1, 0xcf800000, v0
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v1
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v9, v0
-; TONGA-NEXT:    v_mul_lo_u32 v4, s1, v8
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s1, v9, 0
-; TONGA-NEXT:    v_mul_lo_u32 v5, s10, v9
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v1, v5
-; TONGA-NEXT:    v_mul_hi_u32 v10, v9, v0
-; TONGA-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v9, v11, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v8, v0, 0
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v10, v4
-; TONGA-NEXT:    v_addc_u32_e32 v12, vcc, 0, v5, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v8, v11, 0
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v10, v0
-; TONGA-NEXT:    v_addc_u32_e32 v0, vcc, v12, v1, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
-; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v9, v0
-; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, v8, v1, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s1, v10, 0
-; TONGA-NEXT:    v_mul_lo_u32 v8, s1, v11
-; TONGA-NEXT:    v_mul_lo_u32 v9, s10, v10
-; TONGA-NEXT:    v_mul_hi_u32 v12, v10, v0
-; TONGA-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v11, v0, 0
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v8, v1
-; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v9, v1
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], v10, v1, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v11, v1, 0
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v12, v8
+; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v1
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v0, v8
+; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v14, v9, v8
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v8
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v14
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, v1
+; TONGA-NEXT:    v_sub_u32_e32 v15, vcc, 0, v14
+; TONGA-NEXT:    v_subb_u32_e32 v16, vcc, 0, v1, vcc
+; TONGA-NEXT:    v_madmk_f32 v8, v9, 0x4f800000, v8
+; TONGA-NEXT:    v_rcp_f32_e32 v8, v8
+; TONGA-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; TONGA-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; TONGA-NEXT:    v_trunc_f32_e32 v9, v9
+; TONGA-NEXT:    v_madmk_f32 v8, v9, 0xcf800000, v8
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v12, v9
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v13, v8
+; TONGA-NEXT:    v_mul_lo_u32 v10, v15, v12
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v15, v13, 0
+; TONGA-NEXT:    v_mul_lo_u32 v11, v16, v13
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v9, v11
+; TONGA-NEXT:    v_mad_u64_u32 v[9:10], s[0:1], v13, v11, 0
+; TONGA-NEXT:    v_mul_hi_u32 v17, v13, v8
+; TONGA-NEXT:    v_add_u32_e32 v17, vcc, v17, v9
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v12, v8, 0
+; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, 0, v10, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v12, v11, 0
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v17, v8
+; TONGA-NEXT:    v_addc_u32_e32 v8, vcc, v18, v9, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
 ; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v8, v4
-; TONGA-NEXT:    v_addc_u32_e32 v4, vcc, v9, v5, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v4, v0
-; TONGA-NEXT:    s_ashr_i32 s10, s3, 31
-; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; TONGA-NEXT:    s_add_u32 s8, s2, s10
-; TONGA-NEXT:    v_add_u32_e32 v4, vcc, v10, v0
-; TONGA-NEXT:    s_mov_b32 s11, s10
-; TONGA-NEXT:    s_addc_u32 s9, s3, s10
-; TONGA-NEXT:    v_addc_u32_e32 v5, vcc, v11, v1, vcc
-; TONGA-NEXT:    s_xor_b64 s[12:13], s[8:9], s[10:11]
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s12, v5, 0
-; TONGA-NEXT:    v_mul_hi_u32 v8, s12, v4
-; TONGA-NEXT:    v_readfirstlane_b32 s1, v1
-; TONGA-NEXT:    v_readfirstlane_b32 s3, v0
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s13, v5, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], s13, v4, 0
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v8
-; TONGA-NEXT:    s_add_u32 s3, s14, s3
-; TONGA-NEXT:    s_addc_u32 s1, 0, s1
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v4
-; TONGA-NEXT:    v_readfirstlane_b32 s9, v5
-; TONGA-NEXT:    s_add_u32 s3, s3, s14
-; TONGA-NEXT:    v_readfirstlane_b32 s8, v1
-; TONGA-NEXT:    s_addc_u32 s1, s1, s9
-; TONGA-NEXT:    s_addc_u32 s3, s8, 0
-; TONGA-NEXT:    v_readfirstlane_b32 s8, v0
-; TONGA-NEXT:    s_add_u32 s1, s1, s8
-; TONGA-NEXT:    v_mov_b32_e32 v0, s1
-; TONGA-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], s6, v0, 0
-; TONGA-NEXT:    s_addc_u32 s3, 0, s3
-; TONGA-NEXT:    s_mul_i32 s3, s6, s3
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v1
-; TONGA-NEXT:    s_add_i32 s3, s14, s3
-; TONGA-NEXT:    s_mul_i32 s1, s7, s1
-; TONGA-NEXT:    s_add_i32 s3, s3, s1
-; TONGA-NEXT:    s_sub_i32 s1, s13, s3
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v0
-; TONGA-NEXT:    s_sub_u32 s12, s12, s14
-; TONGA-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s18, s12, s6
-; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_subb_u32 s19, s1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s19, s7
-; TONGA-NEXT:    s_cselect_b32 s20, -1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s18, s6
-; TONGA-NEXT:    s_cselect_b32 s21, -1, 0
-; TONGA-NEXT:    s_cmp_eq_u32 s19, s7
-; TONGA-NEXT:    s_cselect_b32 s20, s21, s20
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
-; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s16, s18, s6
-; TONGA-NEXT:    s_subb_u32 s1, s1, 0
-; TONGA-NEXT:    s_cmp_lg_u32 s20, 0
-; TONGA-NEXT:    s_cselect_b32 s16, s16, s18
-; TONGA-NEXT:    s_cselect_b32 s1, s1, s19
-; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; TONGA-NEXT:    s_subb_u32 s3, s13, s3
-; TONGA-NEXT:    s_cmp_ge_u32 s3, s7
-; TONGA-NEXT:    s_cselect_b32 s13, -1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s12, s6
-; TONGA-NEXT:    s_cselect_b32 s6, -1, 0
-; TONGA-NEXT:    s_cmp_eq_u32 s3, s7
-; TONGA-NEXT:    s_cselect_b32 s6, s6, s13
-; TONGA-NEXT:    s_cmp_lg_u32 s6, 0
-; TONGA-NEXT:    s_cselect_b32 s7, s1, s3
-; TONGA-NEXT:    s_cselect_b32 s6, s16, s12
-; TONGA-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
-; TONGA-NEXT:    s_sub_u32 s6, s6, s10
-; TONGA-NEXT:    s_subb_u32 s7, s7, s10
-; TONGA-NEXT:    s_cbranch_execnz .LBB10_4
+; TONGA-NEXT:    v_add_u32_e32 v17, vcc, v13, v8
+; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, v12, v9, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v15, v17, 0
+; TONGA-NEXT:    v_mul_lo_u32 v12, v15, v18
+; TONGA-NEXT:    v_mul_lo_u32 v13, v16, v17
+; TONGA-NEXT:    v_mul_hi_u32 v15, v17, v8
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v18, v8, 0
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v12, v9
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v13, v9
+; TONGA-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v17, v9, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v18, v9, 0
+; TONGA-NEXT:    v_add_u32_e32 v12, vcc, v15, v12
+; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, 0, v13, vcc
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v12, v10
+; TONGA-NEXT:    v_addc_u32_e32 v10, vcc, v13, v11, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
+; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v17, v8
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, v18, v9, vcc
+; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v5
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v4, v12
+; TONGA-NEXT:    v_xor_b32_e32 v13, v8, v12
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v13, v11, 0
+; TONGA-NEXT:    v_mul_hi_u32 v15, v13, v10
+; TONGA-NEXT:    v_addc_u32_e32 v5, vcc, v5, v12, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v12
+; TONGA-NEXT:    v_add_u32_e32 v15, vcc, v15, v8
+; TONGA-NEXT:    v_addc_u32_e32 v16, vcc, 0, v9, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v5, v10, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v5, v11, 0
+; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v15, v8
+; TONGA-NEXT:    v_addc_u32_e32 v8, vcc, v16, v9, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v8, v10
+; TONGA-NEXT:    v_addc_u32_e32 v8, vcc, 0, v9, vcc
+; TONGA-NEXT:    v_mul_lo_u32 v11, v14, v8
+; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v10, 0
+; TONGA-NEXT:    v_mul_lo_u32 v10, v1, v10
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v11, v9
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v10, v9
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v5, v9
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v13, v8
+; TONGA-NEXT:    v_subb_u32_e64 v10, s[0:1], v10, v1, vcc
+; TONGA-NEXT:    v_sub_u32_e64 v11, s[0:1], v8, v14
+; TONGA-NEXT:    v_subbrev_u32_e64 v13, s[2:3], 0, v10, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v13, v1
+; TONGA-NEXT:    v_cndmask_b32_e64 v15, 0, -1, s[2:3]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v11, v14
+; TONGA-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s[2:3]
+; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], v13, v1
+; TONGA-NEXT:    v_subb_u32_e64 v10, s[0:1], v10, v1, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e64 v15, v15, v16, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e64 v16, s[0:1], v11, v14
+; TONGA-NEXT:    v_subb_u32_e32 v5, vcc, v5, v9, vcc
+; TONGA-NEXT:    v_subbrev_u32_e64 v10, s[0:1], 0, v10, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v1
+; TONGA-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v15
+; TONGA-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v14
+; TONGA-NEXT:    v_cndmask_b32_e64 v10, v13, v10, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v1
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v9, v13, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v11, v11, v16, s[0:1]
+; TONGA-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v5, v10, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v5, v8, v11, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v5, v5, v12
+; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v12
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v5, v12
+; TONGA-NEXT:    v_subb_u32_e32 v9, vcc, v1, v12, vcc
+; TONGA-NEXT:    s_cbranch_execnz .LBB10_3
 ; TONGA-NEXT:  .LBB10_2:
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, s0
-; TONGA-NEXT:    s_sub_i32 s1, 0, s0
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
 ; TONGA-NEXT:    v_mov_b32_e32 v9, 0
-; TONGA-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; TONGA-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; TONGA-NEXT:    v_mul_lo_u32 v1, s1, v0
-; TONGA-NEXT:    v_mul_hi_u32 v1, v0, v1
-; TONGA-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
-; TONGA-NEXT:    v_mul_hi_u32 v0, s2, v0
-; TONGA-NEXT:    v_mul_lo_u32 v0, v0, s0
-; TONGA-NEXT:    v_sub_u32_e32 v0, vcc, s2, v0
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v0
-; TONGA-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
-; TONGA-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v0
-; TONGA-NEXT:    v_cmp_le_u32_e32 vcc, s0, v0
-; TONGA-NEXT:    v_cndmask_b32_e32 v8, v0, v1, vcc
-; TONGA-NEXT:    s_branch .LBB10_5
+; TONGA-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; TONGA-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; TONGA-NEXT:    v_mul_lo_u32 v5, v5, v1
+; TONGA-NEXT:    v_mul_hi_u32 v5, v1, v5
+; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
+; TONGA-NEXT:    v_mul_hi_u32 v1, v4, v1
+; TONGA-NEXT:    v_mul_lo_u32 v1, v1, v0
+; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, v4, v1
+; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v0, v1
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; TONGA-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; TONGA-NEXT:    v_sub_u32_e32 v4, vcc, v1, v0
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; TONGA-NEXT:    v_cndmask_b32_e32 v8, v1, v4, vcc
 ; TONGA-NEXT:  .LBB10_3:
-; TONGA-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; TONGA-NEXT:    s_branch .LBB10_2
-; TONGA-NEXT:  .LBB10_4:
-; TONGA-NEXT:    v_mov_b32_e32 v9, s7
-; TONGA-NEXT:    v_mov_b32_e32 v8, s6
-; TONGA-NEXT:  .LBB10_5:
 ; TONGA-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TONGA-NEXT:    v_mov_b32_e32 v0, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TONGA-NEXT:    s_cbranch_vccz .LBB10_9
-; TONGA-NEXT:  ; %bb.6:
+; TONGA-NEXT:    s_cbranch_vccz .LBB10_8
+; TONGA-NEXT:  ; %bb.4:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -3604,8 +3579,8 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v11
 ; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v0, v11
 ; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v1, v11, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB10_8
-; TONGA-NEXT:  .LBB10_7:
+; TONGA-NEXT:    s_cbranch_execnz .LBB10_6
+; TONGA-NEXT:  .LBB10_5:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, 0, v2
 ; TONGA-NEXT:    v_mov_b32_e32 v11, 0
@@ -3624,13 +3599,16 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v10, v0, v1, vcc
-; TONGA-NEXT:  .LBB10_8:
+; TONGA-NEXT:  .LBB10_6:
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s4
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s5
 ; TONGA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
 ; TONGA-NEXT:    s_endpgm
-; TONGA-NEXT:  .LBB10_9:
-; TONGA-NEXT:    s_branch .LBB10_7
+; TONGA-NEXT:  .LBB10_7:
+; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TONGA-NEXT:    s_branch .LBB10_2
+; TONGA-NEXT:  .LBB10_8:
+; TONGA-NEXT:    s_branch .LBB10_5
 ;
 ; EG-LABEL: srem_v2i64:
 ; EG:       ; %bb.0:
@@ -6118,6 +6096,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-LABEL: srem_v4i64:
 ; TONGA:       ; %bb.0:
 ; TONGA-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x24
+; TONGA-NEXT:    v_mov_b32_e32 v8, 0
 ; TONGA-NEXT:    s_waitcnt lgkmcnt(0)
 ; TONGA-NEXT:    s_add_u32 s0, s6, 48
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s6
@@ -6137,275 +6116,249 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_mov_b32_e32 v4, s0
 ; TONGA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; TONGA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
-; TONGA-NEXT:    s_waitcnt vmcnt(3)
-; TONGA-NEXT:    v_readfirstlane_b32 s3, v15
-; TONGA-NEXT:    v_readfirstlane_b32 s2, v14
 ; TONGA-NEXT:    s_waitcnt vmcnt(2)
-; TONGA-NEXT:    v_readfirstlane_b32 s1, v11
-; TONGA-NEXT:    v_readfirstlane_b32 s0, v10
-; TONGA-NEXT:    s_or_b64 s[6:7], s[2:3], s[0:1]
-; TONGA-NEXT:    s_mov_b32 s6, 0
-; TONGA-NEXT:    s_cmp_lg_u64 s[6:7], 0
-; TONGA-NEXT:    s_cbranch_scc0 .LBB12_3
+; TONGA-NEXT:    v_or_b32_e32 v9, v15, v11
+; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_13
 ; TONGA-NEXT:  ; %bb.1:
-; TONGA-NEXT:    s_ashr_i32 s6, s1, 31
-; TONGA-NEXT:    s_add_u32 s8, s0, s6
-; TONGA-NEXT:    s_mov_b32 s7, s6
-; TONGA-NEXT:    s_addc_u32 s9, s1, s6
-; TONGA-NEXT:    s_xor_b64 s[6:7], s[8:9], s[6:7]
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, s6
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v9, s7
-; TONGA-NEXT:    s_sub_u32 s1, 0, s6
-; TONGA-NEXT:    s_subb_u32 s10, 0, s7
-; TONGA-NEXT:    v_madmk_f32 v8, v9, 0x4f800000, v8
-; TONGA-NEXT:    v_rcp_f32_e32 v8, v8
-; TONGA-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
-; TONGA-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
-; TONGA-NEXT:    v_trunc_f32_e32 v9, v9
-; TONGA-NEXT:    v_madmk_f32 v8, v9, 0xcf800000, v8
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v14, v9
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v15, v8
-; TONGA-NEXT:    v_mul_lo_u32 v10, s1, v14
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], s1, v15, 0
-; TONGA-NEXT:    v_mul_lo_u32 v11, s10, v15
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v9, v10
-; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v9, v11
-; TONGA-NEXT:    v_mul_hi_u32 v18, v15, v8
-; TONGA-NEXT:    v_mad_u64_u32 v[9:10], s[8:9], v15, v11, 0
-; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v18, v9
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], v14, v8, 0
-; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v10, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[8:9], v14, v11, 0
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v18, v8
-; TONGA-NEXT:    v_addc_u32_e32 v8, vcc, v19, v9, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v11, vcc
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v10
-; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v15, v8
-; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, v14, v9, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], s1, v18, 0
-; TONGA-NEXT:    v_mul_lo_u32 v14, s1, v19
-; TONGA-NEXT:    v_mul_lo_u32 v15, s10, v18
-; TONGA-NEXT:    v_mul_hi_u32 v20, v18, v8
-; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[8:9], v19, v8, 0
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v14, v9
-; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v15, v9
-; TONGA-NEXT:    v_mad_u64_u32 v[14:15], s[8:9], v18, v9, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], v19, v9, 0
-; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v20, v14
-; TONGA-NEXT:    v_addc_u32_e32 v15, vcc, 0, v15, vcc
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v14, v10
-; TONGA-NEXT:    v_addc_u32_e32 v10, vcc, v15, v11, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v10, v8
-; TONGA-NEXT:    s_ashr_i32 s10, s3, 31
-; TONGA-NEXT:    v_addc_u32_e32 v9, vcc, 0, v9, vcc
-; TONGA-NEXT:    s_add_u32 s8, s2, s10
-; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v18, v8
-; TONGA-NEXT:    s_mov_b32 s11, s10
-; TONGA-NEXT:    s_addc_u32 s9, s3, s10
-; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, v19, v9, vcc
-; TONGA-NEXT:    s_xor_b64 s[12:13], s[8:9], s[10:11]
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], s12, v11, 0
-; TONGA-NEXT:    v_mul_hi_u32 v14, s12, v10
-; TONGA-NEXT:    v_readfirstlane_b32 s1, v9
-; TONGA-NEXT:    v_readfirstlane_b32 s3, v8
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], s13, v11, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[8:9], s13, v10, 0
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v14
-; TONGA-NEXT:    s_add_u32 s3, s14, s3
-; TONGA-NEXT:    s_addc_u32 s1, 0, s1
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v10
-; TONGA-NEXT:    v_readfirstlane_b32 s9, v11
-; TONGA-NEXT:    s_add_u32 s3, s3, s14
-; TONGA-NEXT:    v_readfirstlane_b32 s8, v9
-; TONGA-NEXT:    s_addc_u32 s1, s1, s9
-; TONGA-NEXT:    s_addc_u32 s3, s8, 0
-; TONGA-NEXT:    v_readfirstlane_b32 s8, v8
-; TONGA-NEXT:    s_add_u32 s1, s1, s8
-; TONGA-NEXT:    v_mov_b32_e32 v8, s1
-; TONGA-NEXT:    v_mad_u64_u32 v[8:9], s[8:9], s6, v8, 0
-; TONGA-NEXT:    s_addc_u32 s3, 0, s3
-; TONGA-NEXT:    s_mul_i32 s3, s6, s3
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v9
-; TONGA-NEXT:    s_add_i32 s3, s14, s3
-; TONGA-NEXT:    s_mul_i32 s1, s7, s1
-; TONGA-NEXT:    s_add_i32 s3, s3, s1
-; TONGA-NEXT:    s_sub_i32 s1, s13, s3
-; TONGA-NEXT:    v_readfirstlane_b32 s14, v8
-; TONGA-NEXT:    s_sub_u32 s12, s12, s14
-; TONGA-NEXT:    s_cselect_b64 s[14:15], -1, 0
-; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s18, s12, s6
-; TONGA-NEXT:    s_cselect_b64 s[16:17], -1, 0
-; TONGA-NEXT:    s_subb_u32 s19, s1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s19, s7
-; TONGA-NEXT:    s_cselect_b32 s20, -1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s18, s6
-; TONGA-NEXT:    s_cselect_b32 s21, -1, 0
-; TONGA-NEXT:    s_cmp_eq_u32 s19, s7
-; TONGA-NEXT:    s_cselect_b32 s20, s21, s20
-; TONGA-NEXT:    s_cmp_lg_u64 s[16:17], 0
-; TONGA-NEXT:    s_subb_u32 s1, s1, s7
-; TONGA-NEXT:    s_sub_u32 s16, s18, s6
-; TONGA-NEXT:    s_subb_u32 s1, s1, 0
-; TONGA-NEXT:    s_cmp_lg_u32 s20, 0
-; TONGA-NEXT:    s_cselect_b32 s16, s16, s18
-; TONGA-NEXT:    s_cselect_b32 s1, s1, s19
-; TONGA-NEXT:    s_cmp_lg_u64 s[14:15], 0
-; TONGA-NEXT:    s_subb_u32 s3, s13, s3
-; TONGA-NEXT:    s_cmp_ge_u32 s3, s7
-; TONGA-NEXT:    s_cselect_b32 s13, -1, 0
-; TONGA-NEXT:    s_cmp_ge_u32 s12, s6
-; TONGA-NEXT:    s_cselect_b32 s6, -1, 0
-; TONGA-NEXT:    s_cmp_eq_u32 s3, s7
-; TONGA-NEXT:    s_cselect_b32 s6, s6, s13
-; TONGA-NEXT:    s_cmp_lg_u32 s6, 0
-; TONGA-NEXT:    s_cselect_b32 s7, s1, s3
-; TONGA-NEXT:    s_cselect_b32 s6, s16, s12
-; TONGA-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
-; TONGA-NEXT:    s_sub_u32 s6, s6, s10
-; TONGA-NEXT:    s_subb_u32 s7, s7, s10
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_4
+; TONGA-NEXT:    v_ashrrev_i32_e32 v8, 31, v11
+; TONGA-NEXT:    v_add_u32_e32 v9, vcc, v10, v8
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, v11, v8, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v9, v9, v8
+; TONGA-NEXT:    v_xor_b32_e32 v8, v11, v8
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v9
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v18, v8
+; TONGA-NEXT:    v_sub_u32_e32 v23, vcc, 0, v9
+; TONGA-NEXT:    v_subb_u32_e32 v24, vcc, 0, v8, vcc
+; TONGA-NEXT:    v_madmk_f32 v11, v18, 0x4f800000, v11
+; TONGA-NEXT:    v_rcp_f32_e32 v11, v11
+; TONGA-NEXT:    v_mul_f32_e32 v11, 0x5f7ffffc, v11
+; TONGA-NEXT:    v_mul_f32_e32 v18, 0x2f800000, v11
+; TONGA-NEXT:    v_trunc_f32_e32 v18, v18
+; TONGA-NEXT:    v_madmk_f32 v11, v18, 0xcf800000, v11
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v22, v18
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; TONGA-NEXT:    v_mul_lo_u32 v20, v23, v22
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0
+; TONGA-NEXT:    v_mul_lo_u32 v21, v24, v11
+; TONGA-NEXT:    v_add_u32_e32 v19, vcc, v19, v20
+; TONGA-NEXT:    v_add_u32_e32 v21, vcc, v19, v21
+; TONGA-NEXT:    v_mad_u64_u32 v[19:20], s[0:1], v11, v21, 0
+; TONGA-NEXT:    v_mul_hi_u32 v25, v11, v18
+; TONGA-NEXT:    v_add_u32_e32 v25, vcc, v25, v19
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v22, v18, 0
+; TONGA-NEXT:    v_addc_u32_e32 v26, vcc, 0, v20, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[20:21], s[0:1], v22, v21, 0
+; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v25, v18
+; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, v26, v19, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v21, vcc
+; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v18, v20
+; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v11, v18
+; TONGA-NEXT:    v_addc_u32_e32 v25, vcc, v22, v19, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0
+; TONGA-NEXT:    v_mul_lo_u32 v22, v23, v25
+; TONGA-NEXT:    v_mul_lo_u32 v23, v24, v11
+; TONGA-NEXT:    v_mul_hi_u32 v24, v11, v18
+; TONGA-NEXT:    v_mad_u64_u32 v[20:21], s[0:1], v25, v18, 0
+; TONGA-NEXT:    v_add_u32_e32 v19, vcc, v22, v19
+; TONGA-NEXT:    v_add_u32_e32 v19, vcc, v23, v19
+; TONGA-NEXT:    v_mad_u64_u32 v[22:23], s[0:1], v11, v19, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v25, v19, 0
+; TONGA-NEXT:    v_add_u32_e32 v22, vcc, v24, v22
+; TONGA-NEXT:    v_addc_u32_e32 v23, vcc, 0, v23, vcc
+; TONGA-NEXT:    v_add_u32_e32 v20, vcc, v22, v20
+; TONGA-NEXT:    v_addc_u32_e32 v20, vcc, v23, v21, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v20, v18
+; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v11, v18
+; TONGA-NEXT:    v_addc_u32_e32 v20, vcc, v25, v19, vcc
+; TONGA-NEXT:    v_ashrrev_i32_e32 v22, 31, v15
+; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v14, v22
+; TONGA-NEXT:    v_xor_b32_e32 v23, v18, v22
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v23, v20, 0
+; TONGA-NEXT:    v_mul_hi_u32 v21, v23, v11
+; TONGA-NEXT:    v_addc_u32_e32 v15, vcc, v15, v22, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v15, v15, v22
+; TONGA-NEXT:    v_add_u32_e32 v24, vcc, v21, v18
+; TONGA-NEXT:    v_addc_u32_e32 v25, vcc, 0, v19, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v15, v11, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[20:21], s[0:1], v15, v20, 0
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v24, v18
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, v25, v19, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, 0, v21, vcc
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v11, v20
+; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, 0, v18, vcc
+; TONGA-NEXT:    v_mul_lo_u32 v20, v9, v18
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v9, v11, 0
+; TONGA-NEXT:    v_mul_lo_u32 v11, v8, v11
+; TONGA-NEXT:    v_add_u32_e32 v19, vcc, v20, v19
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v11, v19
+; TONGA-NEXT:    v_sub_u32_e32 v19, vcc, v15, v11
+; TONGA-NEXT:    v_sub_u32_e32 v18, vcc, v23, v18
+; TONGA-NEXT:    v_subb_u32_e64 v19, s[0:1], v19, v8, vcc
+; TONGA-NEXT:    v_sub_u32_e64 v20, s[0:1], v18, v9
+; TONGA-NEXT:    v_subbrev_u32_e64 v21, s[2:3], 0, v19, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v21, v8
+; TONGA-NEXT:    v_cndmask_b32_e64 v23, 0, -1, s[2:3]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v20, v9
+; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v15, v11, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v24, 0, -1, s[2:3]
+; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], v21, v8
+; TONGA-NEXT:    v_subb_u32_e64 v19, s[0:1], v19, v8, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v8
+; TONGA-NEXT:    v_cndmask_b32_e64 v23, v23, v24, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e64 v24, s[0:1], v20, v9
+; TONGA-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v18, v9
+; TONGA-NEXT:    v_subbrev_u32_e64 v19, s[0:1], 0, v19, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v8
+; TONGA-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v23
+; TONGA-NEXT:    v_cndmask_b32_e32 v8, v15, v9, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v20, v20, v24, s[0:1]
+; TONGA-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; TONGA-NEXT:    v_cndmask_b32_e64 v19, v21, v19, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e32 v9, v18, v20, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v8, v11, v19, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v9, v9, v22
+; TONGA-NEXT:    v_xor_b32_e32 v11, v8, v22
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v9, v22
+; TONGA-NEXT:    v_subb_u32_e32 v9, vcc, v11, v22, vcc
+; TONGA-NEXT:    s_cbranch_execnz .LBB12_3
 ; TONGA-NEXT:  .LBB12_2:
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, s0
-; TONGA-NEXT:    s_sub_i32 s1, 0, s0
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v8, v10
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, 0, v10
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v8, v8
 ; TONGA-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
 ; TONGA-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; TONGA-NEXT:    v_mul_lo_u32 v9, s1, v8
+; TONGA-NEXT:    v_mul_lo_u32 v9, v9, v8
 ; TONGA-NEXT:    v_mul_hi_u32 v9, v8, v9
 ; TONGA-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
-; TONGA-NEXT:    v_mul_hi_u32 v8, s2, v8
-; TONGA-NEXT:    v_mul_lo_u32 v8, v8, s0
-; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, s2, v8
-; TONGA-NEXT:    v_subrev_u32_e32 v9, vcc, s0, v8
-; TONGA-NEXT:    v_cmp_le_u32_e32 vcc, s0, v8
+; TONGA-NEXT:    v_mul_hi_u32 v8, v14, v8
+; TONGA-NEXT:    v_mul_lo_u32 v8, v8, v10
+; TONGA-NEXT:    v_sub_u32_e32 v8, vcc, v14, v8
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v8, v10
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v10
 ; TONGA-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; TONGA-NEXT:    v_subrev_u32_e32 v9, vcc, s0, v8
-; TONGA-NEXT:    v_cmp_le_u32_e32 vcc, s0, v8
+; TONGA-NEXT:    v_sub_u32_e32 v9, vcc, v8, v10
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v10
 ; TONGA-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
 ; TONGA-NEXT:    v_mov_b32_e32 v9, 0
-; TONGA-NEXT:    s_branch .LBB12_5
 ; TONGA-NEXT:  .LBB12_3:
-; TONGA-NEXT:    ; implicit-def: $sgpr6_sgpr7
-; TONGA-NEXT:    s_branch .LBB12_2
-; TONGA-NEXT:  .LBB12_4:
-; TONGA-NEXT:    v_mov_b32_e32 v9, s7
-; TONGA-NEXT:    v_mov_b32_e32 v8, s6
-; TONGA-NEXT:  .LBB12_5:
 ; TONGA-NEXT:    v_or_b32_e32 v11, v17, v13
 ; TONGA-NEXT:    v_mov_b32_e32 v10, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_15
-; TONGA-NEXT:  ; %bb.6:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_14
+; TONGA-NEXT:  ; %bb.4:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v10, 31, v13
 ; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v12, v10
 ; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v13, v10, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v11, v11, v10
-; TONGA-NEXT:    v_xor_b32_e32 v10, v13, v10
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v13, v11
-; TONGA-NEXT:    v_cvt_f32_u32_e32 v14, v10
-; TONGA-NEXT:    v_sub_u32_e32 v22, vcc, 0, v11
-; TONGA-NEXT:    v_subb_u32_e32 v23, vcc, 0, v10, vcc
-; TONGA-NEXT:    v_madmk_f32 v13, v14, 0x4f800000, v13
-; TONGA-NEXT:    v_rcp_f32_e32 v13, v13
-; TONGA-NEXT:    v_mul_f32_e32 v13, 0x5f7ffffc, v13
-; TONGA-NEXT:    v_mul_f32_e32 v14, 0x2f800000, v13
-; TONGA-NEXT:    v_trunc_f32_e32 v14, v14
-; TONGA-NEXT:    v_madmk_f32 v13, v14, 0xcf800000, v13
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v20, v14
-; TONGA-NEXT:    v_cvt_u32_f32_e32 v21, v13
-; TONGA-NEXT:    v_mul_lo_u32 v15, v22, v20
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v22, v21, 0
-; TONGA-NEXT:    v_mul_lo_u32 v18, v23, v21
-; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v14, v15
-; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v14, v18
-; TONGA-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v21, v18, 0
-; TONGA-NEXT:    v_mul_hi_u32 v19, v21, v13
-; TONGA-NEXT:    v_add_u32_e32 v24, vcc, v19, v14
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v20, v13, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v20, v18, 0
-; TONGA-NEXT:    v_addc_u32_e32 v15, vcc, 0, v15, vcc
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v24, v13
-; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v15, v14, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v19, vcc
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v13, v18
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; TONGA-NEXT:    v_add_u32_e32 v24, vcc, v21, v13
-; TONGA-NEXT:    v_addc_u32_e32 v25, vcc, v20, v14, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v22, v24, 0
-; TONGA-NEXT:    v_mul_lo_u32 v15, v22, v25
-; TONGA-NEXT:    v_mul_lo_u32 v20, v23, v24
-; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v25, v13, 0
-; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v15, v14
-; TONGA-NEXT:    v_add_u32_e32 v20, vcc, v20, v14
-; TONGA-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v24, v20, 0
-; TONGA-NEXT:    v_mul_hi_u32 v13, v24, v13
-; TONGA-NEXT:    v_mad_u64_u32 v[20:21], s[0:1], v25, v20, 0
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v13, v14
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v15, vcc
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v13, v18
-; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v14, v19, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v21, vcc
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v13, v20
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v14, vcc
-; TONGA-NEXT:    v_add_u32_e32 v15, vcc, v24, v13
-; TONGA-NEXT:    v_addc_u32_e32 v18, vcc, v25, v14, vcc
-; TONGA-NEXT:    v_ashrrev_i32_e32 v19, 31, v17
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v16, v19
-; TONGA-NEXT:    v_xor_b32_e32 v20, v13, v19
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v20, v18, 0
-; TONGA-NEXT:    v_mul_hi_u32 v21, v20, v15
-; TONGA-NEXT:    v_addc_u32_e32 v17, vcc, v17, v19, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v22, v17, v19
-; TONGA-NEXT:    v_add_u32_e32 v21, vcc, v21, v13
-; TONGA-NEXT:    v_addc_u32_e32 v23, vcc, 0, v14, vcc
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v22, v15, 0
-; TONGA-NEXT:    v_mad_u64_u32 v[17:18], s[0:1], v22, v18, 0
-; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v21, v13
-; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v23, v14, vcc
-; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, 0, v18, vcc
-; TONGA-NEXT:    v_add_u32_e32 v15, vcc, v13, v17
-; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, 0, v14, vcc
-; TONGA-NEXT:    v_mul_lo_u32 v17, v11, v13
-; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v11, v15, 0
-; TONGA-NEXT:    v_mul_lo_u32 v15, v10, v15
-; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v17, v14
-; TONGA-NEXT:    v_add_u32_e32 v14, vcc, v15, v14
-; TONGA-NEXT:    v_sub_u32_e32 v15, vcc, v22, v14
-; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, v20, v13
-; TONGA-NEXT:    v_subb_u32_e64 v15, s[0:1], v15, v10, vcc
-; TONGA-NEXT:    v_sub_u32_e64 v17, s[0:1], v13, v11
-; TONGA-NEXT:    v_subbrev_u32_e64 v18, s[2:3], 0, v15, s[0:1]
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v18, v10
-; TONGA-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[2:3]
-; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v17, v11
+; TONGA-NEXT:    v_xor_b32_e32 v15, v11, v10
+; TONGA-NEXT:    v_xor_b32_e32 v20, v13, v10
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v15
+; TONGA-NEXT:    v_cvt_f32_u32_e32 v11, v20
+; TONGA-NEXT:    v_sub_u32_e32 v21, vcc, 0, v15
+; TONGA-NEXT:    v_subb_u32_e32 v22, vcc, 0, v20, vcc
+; TONGA-NEXT:    v_madmk_f32 v10, v11, 0x4f800000, v10
+; TONGA-NEXT:    v_rcp_f32_e32 v10, v10
+; TONGA-NEXT:    v_mul_f32_e32 v10, 0x5f7ffffc, v10
+; TONGA-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v10
+; TONGA-NEXT:    v_trunc_f32_e32 v11, v11
+; TONGA-NEXT:    v_madmk_f32 v10, v11, 0xcf800000, v10
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v18, v11
+; TONGA-NEXT:    v_cvt_u32_f32_e32 v19, v10
+; TONGA-NEXT:    v_mul_lo_u32 v13, v21, v18
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v21, v19, 0
+; TONGA-NEXT:    v_mul_lo_u32 v14, v22, v19
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v11, v13
+; TONGA-NEXT:    v_add_u32_e32 v23, vcc, v11, v14
+; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v19, v23, 0
+; TONGA-NEXT:    v_mul_hi_u32 v11, v19, v10
+; TONGA-NEXT:    v_add_u32_e32 v24, vcc, v11, v13
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v18, v10, 0
+; TONGA-NEXT:    v_addc_u32_e32 v25, vcc, 0, v14, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v18, v23, 0
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v24, v10
+; TONGA-NEXT:    v_addc_u32_e32 v10, vcc, v25, v11, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, 0, v14, vcc
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v10, v13
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_add_u32_e32 v23, vcc, v19, v10
+; TONGA-NEXT:    v_addc_u32_e32 v24, vcc, v18, v11, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v21, v23, 0
+; TONGA-NEXT:    v_mul_lo_u32 v18, v21, v24
+; TONGA-NEXT:    v_mul_lo_u32 v19, v22, v23
+; TONGA-NEXT:    v_mul_hi_u32 v21, v23, v10
+; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v24, v10, 0
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v18, v11
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v19, v11
+; TONGA-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v23, v11, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v24, v11, 0
+; TONGA-NEXT:    v_add_u32_e32 v18, vcc, v21, v18
+; TONGA-NEXT:    v_addc_u32_e32 v19, vcc, 0, v19, vcc
+; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v18, v13
+; TONGA-NEXT:    v_addc_u32_e32 v13, vcc, v19, v14, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v13, v10
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v23, v10
+; TONGA-NEXT:    v_addc_u32_e32 v14, vcc, v24, v11, vcc
+; TONGA-NEXT:    v_ashrrev_i32_e32 v18, 31, v17
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v16, v18
+; TONGA-NEXT:    v_xor_b32_e32 v19, v10, v18
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v19, v14, 0
+; TONGA-NEXT:    v_mul_hi_u32 v21, v19, v13
+; TONGA-NEXT:    v_addc_u32_e32 v17, vcc, v17, v18, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v17, v17, v18
+; TONGA-NEXT:    v_add_u32_e32 v21, vcc, v21, v10
+; TONGA-NEXT:    v_addc_u32_e32 v22, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v17, v13, 0
+; TONGA-NEXT:    v_mad_u64_u32 v[13:14], s[0:1], v17, v14, 0
+; TONGA-NEXT:    v_add_u32_e32 v10, vcc, v21, v10
+; TONGA-NEXT:    v_addc_u32_e32 v10, vcc, v22, v11, vcc
+; TONGA-NEXT:    v_addc_u32_e32 v11, vcc, 0, v14, vcc
+; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v10, v13
+; TONGA-NEXT:    v_addc_u32_e32 v10, vcc, 0, v11, vcc
+; TONGA-NEXT:    v_mul_lo_u32 v14, v15, v10
+; TONGA-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v13, 0
+; TONGA-NEXT:    v_mul_lo_u32 v13, v20, v13
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v14, v11
+; TONGA-NEXT:    v_add_u32_e32 v11, vcc, v13, v11
+; TONGA-NEXT:    v_sub_u32_e32 v13, vcc, v17, v11
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v19, v10
+; TONGA-NEXT:    v_subb_u32_e64 v13, s[0:1], v13, v20, vcc
+; TONGA-NEXT:    v_sub_u32_e64 v14, s[0:1], v10, v15
+; TONGA-NEXT:    v_subbrev_u32_e64 v19, s[2:3], 0, v13, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v19, v20
 ; TONGA-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s[2:3]
-; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], v18, v10
-; TONGA-NEXT:    v_subb_u32_e64 v15, s[0:1], v15, v10, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v20, v20, v21, s[2:3]
-; TONGA-NEXT:    v_sub_u32_e64 v21, s[0:1], v17, v11
-; TONGA-NEXT:    v_subbrev_u32_e64 v15, s[0:1], 0, v15, s[0:1]
-; TONGA-NEXT:    v_subb_u32_e32 v14, vcc, v22, v14, vcc
-; TONGA-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v20
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v14, v10
-; TONGA-NEXT:    v_cndmask_b32_e64 v15, v18, v15, s[0:1]
-; TONGA-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
-; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v11
-; TONGA-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
-; TONGA-NEXT:    v_cmp_eq_u32_e32 vcc, v14, v10
-; TONGA-NEXT:    v_cndmask_b32_e32 v10, v18, v11, vcc
-; TONGA-NEXT:    v_cndmask_b32_e64 v17, v17, v21, s[0:1]
-; TONGA-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; TONGA-NEXT:    v_cndmask_b32_e32 v11, v13, v17, vcc
-; TONGA-NEXT:    v_cndmask_b32_e32 v10, v14, v15, vcc
-; TONGA-NEXT:    v_xor_b32_e32 v11, v11, v19
-; TONGA-NEXT:    v_xor_b32_e32 v13, v10, v19
-; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v11, v19
-; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v13, v19, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_8
-; TONGA-NEXT:  .LBB12_7:
+; TONGA-NEXT:    v_cmp_ge_u32_e64 s[2:3], v14, v15
+; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v17, v11, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s[2:3]
+; TONGA-NEXT:    v_cmp_eq_u32_e64 s[2:3], v19, v20
+; TONGA-NEXT:    v_subb_u32_e64 v13, s[0:1], v13, v20, s[0:1]
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v20
+; TONGA-NEXT:    v_cndmask_b32_e64 v21, v21, v22, s[2:3]
+; TONGA-NEXT:    v_sub_u32_e64 v22, s[0:1], v14, v15
+; TONGA-NEXT:    v_cndmask_b32_e64 v17, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v15
+; TONGA-NEXT:    v_subbrev_u32_e64 v13, s[0:1], 0, v13, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; TONGA-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v20
+; TONGA-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v21
+; TONGA-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc
+; TONGA-NEXT:    v_cndmask_b32_e64 v14, v14, v22, s[0:1]
+; TONGA-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; TONGA-NEXT:    v_cndmask_b32_e64 v13, v19, v13, s[0:1]
+; TONGA-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; TONGA-NEXT:    v_cndmask_b32_e32 v11, v11, v13, vcc
+; TONGA-NEXT:    v_xor_b32_e32 v10, v10, v18
+; TONGA-NEXT:    v_xor_b32_e32 v11, v11, v18
+; TONGA-NEXT:    v_sub_u32_e32 v10, vcc, v10, v18
+; TONGA-NEXT:    v_subb_u32_e32 v11, vcc, v11, v18, vcc
+; TONGA-NEXT:    s_cbranch_execnz .LBB12_6
+; TONGA-NEXT:  .LBB12_5:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v10, v12
 ; TONGA-NEXT:    v_sub_u32_e32 v11, vcc, 0, v12
 ; TONGA-NEXT:    v_rcp_iflag_f32_e32 v10, v10
@@ -6424,13 +6377,13 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
 ; TONGA-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
 ; TONGA-NEXT:    v_mov_b32_e32 v11, 0
-; TONGA-NEXT:  .LBB12_8:
+; TONGA-NEXT:  .LBB12_6:
 ; TONGA-NEXT:    s_waitcnt vmcnt(0)
 ; TONGA-NEXT:    v_or_b32_e32 v13, v5, v1
 ; TONGA-NEXT:    v_mov_b32_e32 v12, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_16
-; TONGA-NEXT:  ; %bb.9:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_15
+; TONGA-NEXT:  ; %bb.7:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v12, 31, v1
 ; TONGA-NEXT:    v_add_u32_e32 v13, vcc, v0, v12
 ; TONGA-NEXT:    v_addc_u32_e32 v1, vcc, v1, v12, vcc
@@ -6536,8 +6489,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v16
 ; TONGA-NEXT:    v_sub_u32_e32 v12, vcc, v5, v16
 ; TONGA-NEXT:    v_subb_u32_e32 v13, vcc, v1, v16, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_11
-; TONGA-NEXT:  .LBB12_10:
+; TONGA-NEXT:    s_cbranch_execnz .LBB12_9
+; TONGA-NEXT:  .LBB12_8:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v1, v0
 ; TONGA-NEXT:    v_sub_u32_e32 v5, vcc, 0, v0
 ; TONGA-NEXT:    v_mov_b32_e32 v13, 0
@@ -6556,12 +6509,12 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v4, vcc, v0, v1
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
 ; TONGA-NEXT:    v_cndmask_b32_e32 v12, v1, v4, vcc
-; TONGA-NEXT:  .LBB12_11:
+; TONGA-NEXT:  .LBB12_9:
 ; TONGA-NEXT:    v_or_b32_e32 v1, v7, v3
 ; TONGA-NEXT:    v_mov_b32_e32 v0, 0
 ; TONGA-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; TONGA-NEXT:    s_cbranch_vccz .LBB12_17
-; TONGA-NEXT:  ; %bb.12:
+; TONGA-NEXT:    s_cbranch_vccz .LBB12_16
+; TONGA-NEXT:  ; %bb.10:
 ; TONGA-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
 ; TONGA-NEXT:    v_add_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_addc_u32_e32 v3, vcc, v3, v0, vcc
@@ -6667,8 +6620,8 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_xor_b32_e32 v1, v1, v15
 ; TONGA-NEXT:    v_sub_u32_e32 v14, vcc, v0, v15
 ; TONGA-NEXT:    v_subb_u32_e32 v15, vcc, v1, v15, vcc
-; TONGA-NEXT:    s_cbranch_execnz .LBB12_14
-; TONGA-NEXT:  .LBB12_13:
+; TONGA-NEXT:    s_cbranch_execnz .LBB12_12
+; TONGA-NEXT:  .LBB12_11:
 ; TONGA-NEXT:    v_cvt_f32_u32_e32 v0, v2
 ; TONGA-NEXT:    v_sub_u32_e32 v1, vcc, 0, v2
 ; TONGA-NEXT:    v_mov_b32_e32 v15, 0
@@ -6687,7 +6640,7 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_subrev_u32_e32 v1, vcc, v2, v0
 ; TONGA-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; TONGA-NEXT:    v_cndmask_b32_e32 v14, v0, v1, vcc
-; TONGA-NEXT:  .LBB12_14:
+; TONGA-NEXT:  .LBB12_12:
 ; TONGA-NEXT:    v_mov_b32_e32 v0, s4
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s5
 ; TONGA-NEXT:    s_add_u32 s0, s4, 16
@@ -6697,13 +6650,16 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; TONGA-NEXT:    v_mov_b32_e32 v1, s1
 ; TONGA-NEXT:    flat_store_dwordx4 v[0:1], v[12:15]
 ; TONGA-NEXT:    s_endpgm
+; TONGA-NEXT:  .LBB12_13:
+; TONGA-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; TONGA-NEXT:    s_branch .LBB12_2
+; TONGA-NEXT:  .LBB12_14:
+; TONGA-NEXT:    s_branch .LBB12_5
 ; TONGA-NEXT:  .LBB12_15:
-; TONGA-NEXT:    s_branch .LBB12_7
-; TONGA-NEXT:  .LBB12_16:
 ; TONGA-NEXT:    ; implicit-def: $vgpr12_vgpr13
-; TONGA-NEXT:    s_branch .LBB12_10
-; TONGA-NEXT:  .LBB12_17:
-; TONGA-NEXT:    s_branch .LBB12_13
+; TONGA-NEXT:    s_branch .LBB12_8
+; TONGA-NEXT:  .LBB12_16:
+; TONGA-NEXT:    s_branch .LBB12_11
 ;
 ; EG-LABEL: srem_v4i64:
 ; EG:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
index d2008be4fd32a..d86a624695e96 100644
--- a/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
+++ b/llvm/test/CodeGen/AMDGPU/undef-handling-crash-in-ra.ll
@@ -51,7 +51,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    v_mov_b32_e32 v62, s66
 ; CHECK-NEXT:    v_mov_b32_e32 v63, s67
 ; CHECK-NEXT:    flat_store_dwordx2 v[58:59], a[32:33]
-; CHECK-NEXT:    ; kill: def $sgpr15 killed $sgpr15
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; CHECK-NEXT:    flat_load_dwordx2 v[60:61], v[58:59]
@@ -68,7 +67,6 @@ define amdgpu_kernel void @foo(ptr addrspace(5) %ptr5, ptr %p0, double %v0, <4 x
 ; CHECK-NEXT:    flat_store_dwordx2 v[46:47], v[44:45]
 ; CHECK-NEXT:    flat_store_dwordx2 v[58:59], a[32:33]
 ; CHECK-NEXT:    s_waitcnt vmcnt(0)
-; CHECK-NEXT:    ; kill: def $sgpr15 killed $sgpr15
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[54:55]
 ; CHECK-NEXT:    flat_load_dwordx2 v[0:1], v[56:57] glc
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)



More information about the llvm-commits mailing list