[llvm-branch-commits] [llvm] AMDGPU: Avoid default subtarget in codegen tests (4/9) (PR #205787)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Thu Jun 25 05:01:17 PDT 2026
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/205787
Continue the scripted migration of tests away from codegenning the
dummy target.
Co-Authored-By: Claude <noreply at anthropic.com> (Claude-Opus-4.8)
>From 850462eaab7a520530921b96a6d54ee0515b8087 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 25 Jun 2026 11:16:47 +0200
Subject: [PATCH] AMDGPU: Avoid default subtarget in codegen tests (4/9)
Continue the scripted migration of tests away from codegenning the
dummy target.
Co-Authored-By: Claude <noreply at anthropic.com> (Claude-Opus-4.8)
---
.../AMDGPU/GlobalISel/combine-add-nullptr.mir | 2 +-
.../AMDGPU/GlobalISel/combine-ashr-narrow.mir | 2 +-
.../combine-extract-vector-load.mir | 2 +-
.../AMDGPU/GlobalISel/combine-fabs-fneg.mir | 2 +-
.../GlobalISel/combine-fcanonicalize.mir | 2 +-
.../CodeGen/AMDGPU/GlobalISel/combine-fsh.mir | 2 +-
.../AMDGPU/GlobalISel/combine-fsub-fneg.mir | 2 +-
.../AMDGPU/GlobalISel/combine-itofp.mir | 2 +-
.../AMDGPU/GlobalISel/combine-lshr-narrow.mir | 2 +-
.../GlobalISel/combine-or-redundant.mir | 2 +-
.../GlobalISel/combine-redundant-and.mir | 2 +-
.../GlobalISel/combine-redundant-neg.mir | 2 +-
.../CodeGen/AMDGPU/GlobalISel/combine-rot.mir | 2 +-
.../GlobalISel/combine-shift-amount-zext.mir | 2 +-
.../combine-shift-imm-chain-illegal-types.mir | 2 +-
.../combine-shift-imm-chain-shlsat.mir | 6 +-
.../GlobalISel/combine-shift-imm-chain.ll | 2 +-
.../combine-shift-of-shifted-logic-shlsat.mir | 2 +-
.../combine-shift-of-shifted-logic.ll | 2 +-
.../AMDGPU/GlobalISel/combine-shifts.mir | 2 +-
.../AMDGPU/GlobalISel/combine-shl-narrow.mir | 2 +-
.../AMDGPU/GlobalISel/combine-trunc-shift.mir | 2 +-
.../CodeGen/AMDGPU/GlobalISel/dummy-target.ll | 2 +-
.../inst-select-amdgcn.s.barrier.mir | 2 +-
.../inst-select-amdgcn.s.sendmsg.mir | 2 +-
.../inst-select-amdgpu-ffbh-u32.mir | 2 +-
.../inst-select-amdgpu-ffbl-b32.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-anyext.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-br.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-brcond.mir | 2 +-
.../GlobalISel/inst-select-frame-index.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-phi.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-sext.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-trunc.mir | 2 +-
.../AMDGPU/GlobalISel/inst-select-zext.mir | 2 +-
.../GlobalISel/irtranslator-amdgcn-sendmsg.ll | 2 +-
.../GlobalISel/irtranslator-constantexpr.ll | 2 +-
.../GlobalISel/irtranslator-constrained-fp.ll | 2 +-
.../AMDGPU/GlobalISel/irtranslator-fence.ll | 2 +-
...translator-fixed-function-abi-vgpr-args.ll | 2 +-
.../GlobalISel/irtranslator-getelementptr.ll | 2 +-
.../GlobalISel/irtranslator-invariant.ll | 2 +-
.../irtranslator-memory-intrinsics.ll | 2 +-
.../GlobalISel/irtranslator-prefetch.ll | 2 +-
.../AMDGPU/GlobalISel/irtranslator-ptrmask.ll | 2 +-
.../AMDGPU/GlobalISel/irtranslator-sat.ll | 2 +-
.../irtranslator-struct-return-intrinsics.ll | 2 +-
.../GlobalISel/irtranslator-zext-vec-index.ll | 2 +-
.../AMDGPU/GlobalISel/knownbits-ptrtoint.mir | 2 +-
.../GlobalISel/legalize-build-vector.mir | 2 +-
.../legalize-extractelement-crash.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-insert.mir | 2 +-
.../legalize-intrinsic-amdgcn-fdiv-fast.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-inttoptr.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-jump-table.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-memcpy.mir | 2 +-
.../GlobalISel/legalize-memcpyinline.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-memmove.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-memset.mir | 2 +-
.../GlobalISel/legalize-memsetinline.mir | 2 +-
.../legalize-merge-values-build-vector.mir | 2 +-
.../AMDGPU/GlobalISel/legalize-ptrtoint.mir | 2 +-
.../GlobalISel/legalize-unmerge-values.mir | 2 +-
.../AMDGPU/GlobalISel/llvm.memcpy.inline.ll | 4 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll | 299 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll | 6 +-
.../AMDGPU/GlobalISel/llvm.memset.inline.ll | 4 +-
.../CodeGen/AMDGPU/GlobalISel/llvm.memset.ll | 36 +-
.../AMDGPU/GlobalISel/merge-buffer-stores.ll | 22 +-
.../CodeGen/AMDGPU/GlobalISel/minmaxabs.ll | 2 +-
.../postlegalizer-combiner-reassoc.mir | 2 +-
.../postlegalizercombiner-select.mir | 2 +-
.../regbankselect-amdgcn.image.sample.1d.ll | 4 +-
.../regbankselect-amdgcn.raw.buffer.load.ll | 4 +-
...egbankselect-amdgcn.raw.ptr.buffer.load.ll | 2 +-
...regbankselect-amdgcn.struct.buffer.load.ll | 4 +-
...egbankselect-amdgcn.struct.buffer.store.ll | 2 +-
...ankselect-amdgcn.struct.ptr.buffer.load.ll | 4 +-
...nkselect-amdgcn.struct.ptr.buffer.store.ll | 2 +-
.../AMDGPU/GlobalISel/regbankselect-fabs.mir | 2 +-
.../AMDGPU/GlobalISel/regbankselect-fneg.mir | 2 +-
.../GlobalISel/regbankselect-inttoptr.mir | 4 +-
.../GlobalISel/regbankselect-ptr-add.mir | 2 +-
.../GlobalISel/regbankselect-ptrtoint.mir | 4 +-
.../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll | 859 +++---
.../AMDGPU/GlobalISel/selected-inst-flags.mir | 2 +-
.../shufflevector-pointer-crash.mir | 2 +-
.../CodeGen/AMDGPU/GlobalISel/srem.i32.ll | 753 ++---
.../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll | 348 +--
.../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll | 2719 +++++++++--------
.../CodeGen/AMDGPU/GlobalISel/urem.i32.ll | 286 +-
.../CodeGen/AMDGPU/GlobalISel/urem.i64.ll | 2620 ++++++++--------
.../CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll | 2 +-
.../AMDGPU/lsr-void-inseltpoison.ll | 2 +-
.../LoopStrengthReduce/AMDGPU/lsr-void.ll | 2 +-
95 files changed, 4100 insertions(+), 4038 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-nullptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-nullptr.mir
index 8c707349c9766..2c7478fee8ebc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-nullptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-add-nullptr.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: add_nullptr_shl_add
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
index 1eb0b7de0692e..07010b206f4da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-ashr-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: narrow_ashr_s64_32_s64amt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
index 0a2b3da7f7d94..55094eae07e0c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-extract-vector-load.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
# Tries to emit a foldable G_PTR_ADD with (p1, s32) operands.
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fabs-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fabs-fneg.mir
index 829d994a92297..83e4e9204b9cf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fabs-fneg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fabs-fneg.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_f16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
index 2d835a5d3ae01..a00ae041de05c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fcanonicalize.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_fcanonicalize
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsh.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsh.mir
index 4574d95e4eb81..51084d6906e04 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsh.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsh.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: fshl_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
index c7fd9b846bc64..fac9729e8f029 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fsub-fneg.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_f16_poszero_nsz
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
index d6135d86022be..9a5c69cb92a70 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-itofp.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: uitofp_char_to_f32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
index 17537f1d9a067..fd407bdbac315 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-lshr-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: narrow_lshr_s64_32_s64amt
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-redundant.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-redundant.mir
index fbe1b778c7bd8..b743b5e9f3026 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-redundant.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-or-redundant.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_const_const_1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-and.mir
index 79c1470f94cec..47fedd53a9c31 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-and.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-and.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_const_const
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir
index b56808892d62e..6ece1c4ad300c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-redundant-neg.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_add_rhs
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rot.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rot.mir
index b22a59c3fef79..48f90922505c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rot.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-rot.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: rotl_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
index 77d30f6fa5223..c4f94d42c102f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-amount-zext.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-regbank-combiner %s -o - | FileCheck %s
---
name: lshr_zext_i16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
index 8a4c19b6d58a7..7d217362a98da 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-illegal-types.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: test_ashr_i44
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-shlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-shlsat.mir
index 4f38e39404c99..3b1615a8034a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-shlsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain-shlsat.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: sshlsat_1
@@ -37,8 +37,8 @@ body: |
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr0
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
; CHECK-NEXT: [[SSHLSAT:%[0-9]+]]:_(s32) = G_SSHLSAT [[COPY]], [[C]](s32)
- ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SSHLSAT]](s32)
- ; CHECK-NEXT: $sgpr0 = COPY [[INT]](s32)
+ ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SSHLSAT]](s32)
+ ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32)
; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
%0:_(s32) = COPY $sgpr0
%2:_(s32) = G_CONSTANT i32 1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
index 2d3088f3edb72..5a68b2d83a607 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-imm-chain.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic-shlsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic-shlsat.mir
index 990d8302a37f0..a1af7df2f5a56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic-shlsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic-shlsat.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: ushlsat_and_1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
index 5532443c0dfc8..a7bb0b32f9dc1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shift-of-shifted-logic.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck %s
define amdgpu_cs i32 @test_shl_and_1(i32 inreg %arg1) {
; CHECK-LABEL: test_shl_and_1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
index fd794bd7d9cf9..f122a774f9a57 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shifts.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-prelegalizer-combiner %s -o - | FileCheck %s
---
name: combine_ashr
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
index f939742ecba61..6423bea288591 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-narrow.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift.mir
index df7fc56799137..7f2e04e11027b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-trunc-shift.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
---
name: trunc_s32_shl_s64_5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
index 31907a6ee9656..6b1f087b5c02e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/dummy-target.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -amdgpu-enable-remove-incompatible-functions=0 -mtriple=amdgcn-amd-amdhsa -stop-after=legalizer -o - %s | FileCheck %s
+; RUN: llc -global-isel -amdgpu-enable-remove-incompatible-functions=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=legalizer -o - %s | FileCheck %s
; Make sure legalizer info doesn't assert on dummy targets
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.barrier.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.barrier.mir
index e264baff40ad0..c91b2b90c6053 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.barrier.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.barrier.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.sendmsg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.sendmsg.mir
index 402f4db861e93..5bc1309914e8b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.sendmsg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.s.sendmsg.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
---
name: test_sendmsg
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir
index f4e0c69dfb85b..7774b0e1861e5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbh-u32.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbl-b32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbl-b32.mir
index bf2f9367ae8e9..00d651c7ad762 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbl-b32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgpu-ffbl-b32.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir
index 18f392e5d5654..6703a1608a7a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-anyext.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN,SI
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN,SI
# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN,GFX10
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-br.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-br.mir
index 95b25012c64d1..d0531fb0e955a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-br.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-br.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir
index ecb07f79e9fd1..4b3311b09d17b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-brcond.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2> %t | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o - 2> %t | FileCheck -check-prefixes=GCN %s
# RUN: FileCheck -check-prefix=ERR %s < %t
# ERR-NOT: remark:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frame-index.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frame-index.mir
index a5c35b043d451..d9505d838baba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frame-index.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-frame-index.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
index 4bb9eb807e156..76b21e92903a2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GCN
---
name: g_phi_s32_ss_sbranch
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
index d6994745749e7..238a8e4663d92 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sext.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir
index 065e5dd5b81bc..18af60d754fb8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-trunc.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
index 2f034a001925d..dc55090b86061 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-zext.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GCN
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
index 0317ec2c377a3..2da118d01e459 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgcn-sendmsg.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
declare void @llvm.amdgcn.s.sendmsg(i32 immarg, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
index f8a84bf12a2c6..0178bb3d8e240 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constantexpr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=irtranslator -o - %s | FileCheck %s
@var = global i32 poison
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
index 4f360ef3c9f1e..9d014d9b2a480 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-constrained-fp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -stop-after=irtranslator %s -o - | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -stop-after=irtranslator %s -o - | FileCheck %s
define float @v_constained_fadd_f32_fpexcept_strict(float %x, float %y) #0 {
; CHECK-LABEL: name: v_constained_fadd_f32_fpexcept_strict
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
index ee35e1c4e1f8e..446c9aba2f19e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fence.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=irtranslator < %s | FileCheck %s
define amdgpu_kernel void @system_one_as_acquire() {
; CHECK-LABEL: name: system_one_as_acquire
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
index 21db772536a62..90fbfb238cae3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-fixed-function-abi-vgpr-args.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator -o - %s | FileCheck --check-prefix=FIXED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=irtranslator -o - %s | FileCheck --check-prefix=FIXED %s
; Make sure arg1 is not allocated in v31, which is reserved for
; workitem IDs with -amdgpu-fixed-function-abi.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
index 2321cca252b83..44e3fc23d0580 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-getelementptr.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=irtranslator -o - %s | FileCheck %s
; Test 64-bit pointer with 64-bit index
define <2 x ptr addrspace(1)> @vector_gep_v2p1_index_v2i64(<2 x ptr addrspace(1)> %ptr, <2 x i64> %idx) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
index b34d56b4f0a64..6d22eecdbbea2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-invariant.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -simplify-mir -global-isel -mtriple=amdgcn -stop-after=irtranslator %s -o - | FileCheck %s
+; RUN: llc -simplify-mir -global-isel -mtriple=amdgcn -mcpu=gfx600 -stop-after=irtranslator %s -o - | FileCheck %s
; Check the flags set on the memory operands for loads determined to
; be constants by alias analysis.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
index e4696094e40c6..f82d0b37a9609 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-memory-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -O0 -stop-after=irtranslator %s -o - | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -O0 -stop-after=irtranslator %s -o - | FileCheck %s
; Size operand should be the minimum of the two pointer sizes.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
index f74a7e6e1e681..bc17fda89b31d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-prefetch.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel -mtriple=amdgcn -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -stop-after=irtranslator < %s | FileCheck %s
define void @prefetch_read(ptr %ptr) {
; CHECK-LABEL: name: prefetch_read
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
index ffeb7c07a26d6..51bd38aecc0dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-ptrmask.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -stop-after=irtranslator < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -stop-after=irtranslator < %s | FileCheck %s
define ptr @ptrmask_flat_i64(ptr %ptr, i64 %mask) {
; CHECK-LABEL: name: ptrmask_flat_i64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
index 00de01a694403..896fd1ce505ff 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-sat.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -global-isel -stop-after=irtranslator %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -global-isel -stop-after=irtranslator %s -o - | FileCheck %s
define i16 @uaddsat_i16(i16 %lhs, i16 %rhs) {
; CHECK-LABEL: name: uaddsat_i16
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-struct-return-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-struct-return-intrinsics.ll
index 01367a5fad447..1482e40545019 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-struct-return-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-struct-return-intrinsics.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -global-isel -stop-after=irtranslator -o - %s | FileCheck %s
declare { float, i1 } @llvm.amdgcn.div.scale.f32(float, float, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
index eeaf8ee34eb9a..350f01e4c7165 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-zext-vec-index.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -mtriple=amdgcn -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -O0 -stop-after=irtranslator -global-isel %s -o - | FileCheck %s
define i8 @f_i1_1() {
; CHECK-LABEL: name: f_i1_1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
index 4073568fd4210..5e288bb011d59 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -passes="print<gisel-value-tracking>" %s -filetype=null 2>&1 | FileCheck %s
## Check that we don't incorrectly assume known zeroes for and extend of a truncated ptrtoint
## Test case for https://github.com/llvm/llvm-project/issues/139598
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
index 25652b69afa92..a86cb34ca3846 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-build-vector.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -O0 -run-pass=legalizer %s -o - | FileCheck %s
---
name: legal_v2s32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extractelement-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extractelement-crash.mir
index 805890a75d402..b0bfcdaf1eccc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extractelement-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-extractelement-crash.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: f
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
index f46cc9b66fa15..35f6aaa6429b3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_insert_s64_s32_offset0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir
index 2a501ee8cdb1e..efbe5d10e82c0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-intrinsic-amdgcn-fdiv-fast.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-mesa-mesa3d -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -run-pass=legalizer %s -o - | FileCheck %s
---
name: test_amdgcn_fdiv_fast
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-inttoptr.mir
index df13d7c50fc96..91b13a6d3ed72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-inttoptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-inttoptr.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -O0 -run-pass=legalizer -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -O0 -run-pass=legalizer -o - %s | FileCheck %s
---
name: test_inttoptr_s64_to_p0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
index 80737815cc16e..0b218e41950dd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-jump-table.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: not llc -mtriple=amdgcn -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck %s
+# RUN: not llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=legalizer -filetype=null %s 2>&1 | FileCheck %s
# CHECK: LLVM ERROR: unable to legalize instruction: %3:_(p0) = G_JUMP_TABLE %jump-table.0 (in function: jt_test)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
index 4f5f52b25cdf7..701eea90296e4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpy.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: memcpy_test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
index 0392aef6fe030..4f463b90aaf62 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memcpyinline.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: memcpyinline_test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
index 1f8d1aac24ebb..c3ea63a694b6e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memmove.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: memmove_test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
index dda94e1550585..9f58a0fbaa9c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memset.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: memset_test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memsetinline.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memsetinline.mir
index e689bc107a07b..3b402d556201a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memsetinline.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-memsetinline.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=legalizer -verify-machineinstrs -o - %s | FileCheck %s
---
name: memsetinline_test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
index 123454a26af2b..1457e50eccbb2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-merge-values-build-vector.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -O0 -run-pass=legalizer -o - %s | FileCheck %s
---
name: test_merge_s32_s32_s64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrtoint.mir
index 59945720ebab5..7e55f92f8c907 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrtoint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ptrtoint.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s
---
name: test_ptrtoint_p0_to_s64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
index 16ba8c295c047..fb9c18b965209 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-unmerge-values.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn-- -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -O0 -run-pass=legalizer -global-isel-abort=0 -o - %s | FileCheck %s
---
name: test_unmerge_s32_s64
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
index 7c0484bf3f317..c543f5f9bda09 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.inline.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memcpy.inline.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
index d302570de9c33..311c691b907b5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memcpy.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memcpy.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1 immarg)
@@ -12,191 +12,174 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: .LBB0_1: ; %static-memcpy-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
-; LOOP-NEXT: s_waitcnt expcnt(1)
-; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64
-; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT: buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT: buffer_load_ubyte v8, v[2:3], s[0:3], 0 addr64 offset:4
-; LOOP-NEXT: buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:5
-; LOOP-NEXT: buffer_load_ubyte v10, v[2:3], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:1
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64
+; LOOP-NEXT: s_waitcnt expcnt(5)
+; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:3
+; LOOP-NEXT: s_waitcnt expcnt(3)
+; LOOP-NEXT: buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:2
+; LOOP-NEXT: buffer_load_ubyte v8, v[2:3], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: buffer_load_ubyte v10, v[2:3], s[0:3], 0 addr64 offset:7
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT: buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:8
-; LOOP-NEXT: s_waitcnt vmcnt(7)
-; LOOP-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; LOOP-NEXT: v_or_b32_e32 v4, v5, v4
-; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT: s_waitcnt vmcnt(6)
-; LOOP-NEXT: v_lshlrev_b32_e32 v7, 24, v7
-; LOOP-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; LOOP-NEXT: v_or_b32_e32 v6, v7, v6
-; LOOP-NEXT: buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT: s_waitcnt vmcnt(5)
-; LOOP-NEXT: v_lshlrev_b32_e32 v9, 8, v9
-; LOOP-NEXT: v_or_b32_e32 v8, v9, v8
-; LOOP-NEXT: buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT: s_waitcnt vmcnt(4)
-; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11
-; LOOP-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; LOOP-NEXT: v_or_b32_e32 v10, v11, v10
-; LOOP-NEXT: buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT: s_waitcnt vmcnt(3)
-; LOOP-NEXT: v_lshlrev_b32_e32 v5, 8, v5
-; LOOP-NEXT: v_or_b32_e32 v5, v5, v12
-; LOOP-NEXT: buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT: s_waitcnt vmcnt(2)
-; LOOP-NEXT: v_lshlrev_b32_e32 v9, 24, v9
+; LOOP-NEXT: buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_load_ubyte v14, v[2:3], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: buffer_load_ubyte v16, v[2:3], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_load_ubyte v18, v[2:3], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v4
+; LOOP-NEXT: v_or_b32_e32 v4, v4, v5
+; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v6
+; LOOP-NEXT: s_waitcnt vmcnt(12)
+; LOOP-NEXT: v_lshlrev_b32_e32 v8, 8, v8
; LOOP-NEXT: v_lshlrev_b32_e32 v7, 16, v7
-; LOOP-NEXT: v_or_b32_e32 v7, v9, v7
-; LOOP-NEXT: buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT: buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:14
-; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: s_waitcnt vmcnt(10)
+; LOOP-NEXT: v_lshlrev_b32_e32 v10, 24, v10
+; LOOP-NEXT: s_waitcnt vmcnt(9)
+; LOOP-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; LOOP-NEXT: s_waitcnt vmcnt(8)
; LOOP-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; LOOP-NEXT: v_or_b32_e32 v11, v12, v11
-; LOOP-NEXT: s_waitcnt vmcnt(1)
-; LOOP-NEXT: v_lshlrev_b32_e32 v9, 24, v9
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v12, 16, v13
-; LOOP-NEXT: v_or_b32_e32 v9, v9, v12
-; LOOP-NEXT: buffer_load_ubyte v12, v[2:3], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT: buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:16
-; LOOP-NEXT: buffer_load_ubyte v14, v[2:3], s[0:3], 0 addr64 offset:19
-; LOOP-NEXT: buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:18
-; LOOP-NEXT: s_waitcnt vmcnt(3)
-; LOOP-NEXT: v_lshlrev_b32_e32 v12, 8, v12
-; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: v_or_b32_e32 v8, v8, v9
+; LOOP-NEXT: s_waitcnt vmcnt(6)
+; LOOP-NEXT: v_lshlrev_b32_e32 v14, 24, v14
+; LOOP-NEXT: s_waitcnt vmcnt(5)
+; LOOP-NEXT: v_lshlrev_b32_e32 v15, 16, v15
+; LOOP-NEXT: buffer_load_ubyte v9, v[2:3], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: v_or_b32_e32 v10, v10, v11
+; LOOP-NEXT: buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:19
; LOOP-NEXT: v_or_b32_e32 v12, v12, v13
-; LOOP-NEXT: s_waitcnt vmcnt(1)
-; LOOP-NEXT: v_lshlrev_b32_e32 v13, 24, v14
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v14, 16, v15
-; LOOP-NEXT: v_or_b32_e32 v13, v13, v14
-; LOOP-NEXT: buffer_load_ubyte v14, v[2:3], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT: buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:20
-; LOOP-NEXT: buffer_load_ubyte v16, v[2:3], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT: buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:22
-; LOOP-NEXT: s_waitcnt vmcnt(3)
-; LOOP-NEXT: v_lshlrev_b32_e32 v14, 8, v14
-; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: buffer_load_ubyte v13, v[2:3], s[0:3], 0 addr64 offset:20
; LOOP-NEXT: v_or_b32_e32 v14, v14, v15
-; LOOP-NEXT: s_waitcnt vmcnt(1)
-; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v16
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v16, 16, v17
-; LOOP-NEXT: v_or_b32_e32 v15, v15, v16
-; LOOP-NEXT: buffer_load_ubyte v16, v[2:3], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT: buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:24
-; LOOP-NEXT: buffer_load_ubyte v18, v[2:3], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT: buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: s_waitcnt vmcnt(8)
; LOOP-NEXT: v_lshlrev_b32_e32 v16, 8, v16
-; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: s_waitcnt vmcnt(6)
+; LOOP-NEXT: v_lshlrev_b32_e32 v18, 24, v18
+; LOOP-NEXT: s_waitcnt vmcnt(5)
+; LOOP-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; LOOP-NEXT: v_or_b32_e32 v6, v6, v7
; LOOP-NEXT: v_or_b32_e32 v16, v16, v17
-; LOOP-NEXT: s_waitcnt vmcnt(1)
-; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v18
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v18, 16, v19
-; LOOP-NEXT: v_or_b32_e32 v17, v17, v18
-; LOOP-NEXT: buffer_load_ubyte v18, v[2:3], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT: buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:28
-; LOOP-NEXT: buffer_load_ubyte v20, v[2:3], s[0:3], 0 addr64 offset:31
-; LOOP-NEXT: buffer_load_ubyte v21, v[2:3], s[0:3], 0 addr64 offset:30
-; LOOP-NEXT: s_waitcnt vmcnt(3)
-; LOOP-NEXT: v_lshlrev_b32_e32 v18, 8, v18
-; LOOP-NEXT: s_waitcnt vmcnt(2)
+; LOOP-NEXT: buffer_load_ubyte v17, v[2:3], s[0:3], 0 addr64 offset:23
; LOOP-NEXT: v_or_b32_e32 v18, v18, v19
-; LOOP-NEXT: s_waitcnt vmcnt(1)
-; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v20
-; LOOP-NEXT: s_waitcnt vmcnt(0)
-; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v21
-; LOOP-NEXT: v_or_b32_e32 v19, v19, v20
+; LOOP-NEXT: buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:16
; LOOP-NEXT: v_or_b32_e32 v4, v6, v4
; LOOP-NEXT: v_or_b32_e32 v6, v10, v8
-; LOOP-NEXT: v_or_b32_e32 v5, v7, v5
-; LOOP-NEXT: v_or_b32_e32 v7, v9, v11
-; LOOP-NEXT: v_or_b32_e32 v8, v13, v12
-; LOOP-NEXT: v_or_b32_e32 v9, v15, v14
-; LOOP-NEXT: v_or_b32_e32 v10, v17, v16
-; LOOP-NEXT: v_or_b32_e32 v11, v19, v18
-; LOOP-NEXT: v_bfe_u32 v12, v4, 8, 8
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:1
-; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_or_b32_e32 v8, v14, v12
+; LOOP-NEXT: v_or_b32_e32 v10, v18, v16
; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v4
+; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v6
+; LOOP-NEXT: v_lshrrev_b32_e32 v16, 16, v8
+; LOOP-NEXT: v_lshrrev_b32_e32 v18, 16, v10
+; LOOP-NEXT: s_waitcnt vmcnt(7)
+; LOOP-NEXT: v_lshlrev_b32_e32 v5, 8, v5
+; LOOP-NEXT: s_waitcnt vmcnt(6)
+; LOOP-NEXT: v_lshlrev_b32_e32 v9, 16, v9
+; LOOP-NEXT: s_waitcnt vmcnt(5)
+; LOOP-NEXT: v_lshlrev_b32_e32 v11, 24, v11
+; LOOP-NEXT: v_or_b32_e32 v9, v11, v9
+; LOOP-NEXT: buffer_load_ubyte v11, v[2:3], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: s_waitcnt vmcnt(4)
+; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v15
+; LOOP-NEXT: v_or_b32_e32 v13, v15, v13
+; LOOP-NEXT: buffer_load_ubyte v15, v[2:3], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_load_ubyte v23, v[2:3], s[0:3], 0 addr64 offset:24
+; LOOP-NEXT: s_waitcnt vmcnt(5)
+; LOOP-NEXT: v_lshlrev_b32_e32 v17, 24, v17
+; LOOP-NEXT: s_waitcnt vmcnt(4)
+; LOOP-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; LOOP-NEXT: s_waitcnt vmcnt(3)
+; LOOP-NEXT: v_or_b32_e32 v5, v5, v7
+; LOOP-NEXT: buffer_load_ubyte v7, v[2:3], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: v_or_b32_e32 v17, v17, v19
+; LOOP-NEXT: buffer_load_ubyte v19, v[2:3], s[0:3], 0 addr64 offset:28
+; LOOP-NEXT: buffer_load_ubyte v20, v[2:3], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_load_ubyte v21, v[2:3], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: buffer_load_ubyte v22, v[2:3], s[0:3], 0 addr64 offset:31
+; LOOP-NEXT: v_or_b32_e32 v5, v9, v5
+; LOOP-NEXT: v_or_b32_e32 v9, v17, v13
+; LOOP-NEXT: v_bfe_u32 v13, v4, 8, 8
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v4, 24, v4
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:2
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v6
-; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v4, v6, 8, 8
; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:4
+; LOOP-NEXT: v_bfe_u32 v17, v8, 8, 8
+; LOOP-NEXT: buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:8
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6
-; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:5
+; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8
+; LOOP-NEXT: buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:12
+; LOOP-NEXT: buffer_store_byte v13, v[0:1], s[0:3], 0 addr64 offset:1
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v4, 16, v5
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: v_lshrrev_b32_e32 v13, 16, v5
+; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:2
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_bfe_u32 v12, v5, 8, 8
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:8
+; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:16
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v5, 24, v5
-; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:7
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v6, 16, v7
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:9
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v12, v7, 8, 8
-; LOOP-NEXT: buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:12
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7
-; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:10
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v4, 16, v8
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:11
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v5, v8, 8, 8
-; LOOP-NEXT: buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:16
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v8, 24, v8
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:13
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v12, 16, v9
-; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:3
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v6, v9, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v4, 16, v9
; LOOP-NEXT: buffer_store_byte v9, v[0:1], s[0:3], 0 addr64 offset:20
+; LOOP-NEXT: buffer_store_byte v14, v[0:1], s[0:3], 0 addr64 offset:6
+; LOOP-NEXT: buffer_store_byte v17, v[0:1], s[0:3], 0 addr64 offset:9
+; LOOP-NEXT: buffer_store_byte v16, v[0:1], s[0:3], 0 addr64 offset:10
+; LOOP-NEXT: s_waitcnt vmcnt(14)
+; LOOP-NEXT: v_lshlrev_b32_e32 v11, 16, v11
+; LOOP-NEXT: v_lshlrev_b32_e32 v15, 24, v15
+; LOOP-NEXT: v_or_b32_e32 v11, v15, v11
+; LOOP-NEXT: v_lshlrev_b32_e32 v15, 8, v20
+; LOOP-NEXT: v_lshlrev_b32_e32 v7, 8, v7
+; LOOP-NEXT: v_or_b32_e32 v15, v15, v19
+; LOOP-NEXT: s_waitcnt vmcnt(12)
+; LOOP-NEXT: v_lshlrev_b32_e32 v19, 24, v22
+; LOOP-NEXT: v_lshlrev_b32_e32 v20, 16, v21
+; LOOP-NEXT: v_or_b32_e32 v7, v7, v23
+; LOOP-NEXT: v_or_b32_e32 v19, v19, v20
+; LOOP-NEXT: v_or_b32_e32 v7, v11, v7
+; LOOP-NEXT: v_or_b32_e32 v11, v19, v15
+; LOOP-NEXT: v_bfe_u32 v15, v6, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v6
+; LOOP-NEXT: v_bfe_u32 v19, v10, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10
+; LOOP-NEXT: buffer_store_byte v15, v[0:1], s[0:3], 0 addr64 offset:5
; LOOP-NEXT: s_waitcnt expcnt(0)
+; LOOP-NEXT: v_bfe_u32 v15, v9, 8, 8
; LOOP-NEXT: v_lshrrev_b32_e32 v9, 24, v9
-; LOOP-NEXT: buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:15
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v7, 16, v10
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:17
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v5, v10, 8, 8
-; LOOP-NEXT: buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:24
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v10, 24, v10
-; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: v_lshrrev_b32_e32 v14, 16, v7
+; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:7
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v4, 16, v11
-; LOOP-NEXT: buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: v_bfe_u32 v6, v7, 8, 8
+; LOOP-NEXT: buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:24
; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_bfe_u32 v8, v11, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v7, 24, v7
+; LOOP-NEXT: v_lshrrev_b32_e32 v17, 16, v11
+; LOOP-NEXT: v_bfe_u32 v16, v11, 8, 8
; LOOP-NEXT: buffer_store_byte v11, v[0:1], s[0:3], 0 addr64 offset:28
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v11, 24, v11
-; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:21
-; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:22
+; LOOP-NEXT: buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:11
+; LOOP-NEXT: buffer_store_byte v19, v[0:1], s[0:3], 0 addr64 offset:13
+; LOOP-NEXT: buffer_store_byte v18, v[0:1], s[0:3], 0 addr64 offset:14
+; LOOP-NEXT: buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:15
+; LOOP-NEXT: buffer_store_byte v12, v[0:1], s[0:3], 0 addr64 offset:17
+; LOOP-NEXT: buffer_store_byte v13, v[0:1], s[0:3], 0 addr64 offset:18
+; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:19
+; LOOP-NEXT: buffer_store_byte v15, v[0:1], s[0:3], 0 addr64 offset:21
+; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:22
; LOOP-NEXT: buffer_store_byte v9, v[0:1], s[0:3], 0 addr64 offset:23
-; LOOP-NEXT: buffer_store_byte v5, v[0:1], s[0:3], 0 addr64 offset:25
-; LOOP-NEXT: buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:26
-; LOOP-NEXT: buffer_store_byte v10, v[0:1], s[0:3], 0 addr64 offset:27
-; LOOP-NEXT: buffer_store_byte v8, v[0:1], s[0:3], 0 addr64 offset:29
-; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:30
+; LOOP-NEXT: buffer_store_byte v6, v[0:1], s[0:3], 0 addr64 offset:25
+; LOOP-NEXT: buffer_store_byte v14, v[0:1], s[0:3], 0 addr64 offset:26
+; LOOP-NEXT: buffer_store_byte v7, v[0:1], s[0:3], 0 addr64 offset:27
+; LOOP-NEXT: buffer_store_byte v16, v[0:1], s[0:3], 0 addr64 offset:29
+; LOOP-NEXT: buffer_store_byte v17, v[0:1], s[0:3], 0 addr64 offset:30
; LOOP-NEXT: buffer_store_byte v11, v[0:1], s[0:3], 0 addr64 offset:31
; LOOP-NEXT: s_add_u32 s0, s0, 32
; LOOP-NEXT: s_addc_u32 s1, s1, 0
@@ -206,9 +189,9 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: s_mov_b32 s2, 0
; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: s_mov_b64 s[0:1], 0
-; LOOP-NEXT: s_waitcnt expcnt(1)
; LOOP-NEXT: buffer_load_ubyte v4, v[2:3], s[0:3], 0 addr64 offset:33
; LOOP-NEXT: buffer_load_ubyte v5, v[2:3], s[0:3], 0 addr64 offset:35
+; LOOP-NEXT: s_waitcnt expcnt(5)
; LOOP-NEXT: buffer_load_ubyte v6, v[2:3], s[0:3], 0 addr64 offset:34
; LOOP-NEXT: buffer_load_ubyte v2, v[2:3], s[0:3], 0 addr64 offset:32
; LOOP-NEXT: s_waitcnt vmcnt(3)
@@ -221,8 +204,8 @@ define amdgpu_cs void @memcpy_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src)
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
-; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
index d5cd7c9b3c58a..1169c1bbbc60c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memmove.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memmove.p1.p1.i32(ptr addrspace(1), ptr addrspace(1), i32, i1)
@@ -66,8 +66,8 @@ define amdgpu_cs void @memmove_p1i8(ptr addrspace(1) %dst, ptr addrspace(1) %src
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_or_b32_e32 v3, v4, v5
; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
-; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
+; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64
; LOOP-NEXT: s_waitcnt expcnt(0)
; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.inline.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.inline.ll
index cb2573a354ca1..687c0d1a90ff2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.inline.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.inline.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
-; RUN: llc -global-isel -mtriple=amdgcn -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -mem-intrinsic-expand-size=3 %s -o - | FileCheck -check-prefix=GCN %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -mem-intrinsic-expand-size=5 %s -o - | FileCheck -check-prefix=GCN %s
declare void @llvm.memset.inline.p1.i32(ptr addrspace(1), i8, i32, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
index 211ec5b288a78..e62b735f95458 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.memset.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
-; RUN: llc -global-isel -mtriple=amdgcn-- -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=35 %s -o - | FileCheck -check-prefix=LOOP %s
+; RUN: llc -global-isel -mtriple=amdgcn-- -mcpu=gfx600 -amdgpu-memcpy-loop-unroll=2 -mem-intrinsic-expand-size=37 %s -o - | FileCheck -check-prefix=UNROLL %s
declare void @llvm.memset.p1.i32(ptr addrspace(1), i8, i32, i1)
@@ -8,17 +8,17 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-LABEL: memset_p1i8:
; LOOP: ; %bb.0:
; LOOP-NEXT: v_and_b32_e32 v3, 0xff, v2
-; LOOP-NEXT: s_mov_b64 s[0:1], 0
-; LOOP-NEXT: s_mov_b32 s2, 0
-; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 8, v3
+; LOOP-NEXT: v_or_b32_e32 v4, v3, v4
; LOOP-NEXT: v_lshlrev_b32_e32 v5, 16, v3
-; LOOP-NEXT: v_lshlrev_b32_e32 v6, 24, v3
-; LOOP-NEXT: v_or_b32_e32 v3, v3, v4
-; LOOP-NEXT: v_or_b32_e32 v3, v3, v5
-; LOOP-NEXT: v_or_b32_e32 v3, v3, v6
+; LOOP-NEXT: v_or_b32_e32 v4, v4, v5
+; LOOP-NEXT: v_lshlrev_b32_e32 v3, 24, v3
+; LOOP-NEXT: v_or_b32_e32 v3, v4, v3
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: v_lshrrev_b32_e32 v4, 16, v3
; LOOP-NEXT: v_bfe_u32 v5, v3, 8, 8
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
; LOOP-NEXT: v_lshrrev_b32_e32 v6, 24, v3
; LOOP-NEXT: .LBB0_1: ; %static-memset-expansion-main-body
; LOOP-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -60,23 +60,23 @@ define amdgpu_cs void @memset_p1i8(ptr addrspace(1) %dst, i8 %val) {
; LOOP-NEXT: s_cbranch_scc1 .LBB0_1
; LOOP-NEXT: ; %bb.2: ; %static-memset-post-expansion
; LOOP-NEXT: v_and_b32_e32 v2, 0xff, v2
-; LOOP-NEXT: s_mov_b32 s2, 0
-; LOOP-NEXT: s_mov_b32 s3, 0xf000
-; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: s_waitcnt expcnt(3)
; LOOP-NEXT: v_lshlrev_b32_e32 v3, 8, v2
+; LOOP-NEXT: v_or_b32_e32 v3, v2, v3
; LOOP-NEXT: s_waitcnt expcnt(1)
; LOOP-NEXT: v_lshlrev_b32_e32 v4, 16, v2
-; LOOP-NEXT: v_lshlrev_b32_e32 v5, 24, v2
-; LOOP-NEXT: v_or_b32_e32 v2, v2, v3
-; LOOP-NEXT: v_or_b32_e32 v2, v2, v4
-; LOOP-NEXT: v_or_b32_e32 v2, v2, v5
+; LOOP-NEXT: v_or_b32_e32 v3, v3, v4
+; LOOP-NEXT: v_lshlrev_b32_e32 v2, 24, v2
+; LOOP-NEXT: v_or_b32_e32 v2, v3, v2
; LOOP-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; LOOP-NEXT: s_mov_b32 s2, 0
+; LOOP-NEXT: s_mov_b32 s3, 0xf000
+; LOOP-NEXT: s_mov_b64 s[0:1], 0
; LOOP-NEXT: v_bfe_u32 v4, v2, 8, 8
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:32
-; LOOP-NEXT: s_waitcnt expcnt(0)
-; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v4, v[0:1], s[0:3], 0 addr64 offset:33
+; LOOP-NEXT: s_waitcnt expcnt(1)
+; LOOP-NEXT: v_lshrrev_b32_e32 v2, 24, v2
; LOOP-NEXT: buffer_store_byte v3, v[0:1], s[0:3], 0 addr64 offset:34
; LOOP-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 offset:35
; LOOP-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
index bf9b27d1e25a6..4e931246c101a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/merge-buffer-stores.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx600 -o - %s | FileCheck %s
define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspace(6) inreg %arg3) {
; CHECK-LABEL: test1:
; CHECK: ; %bb.0: ; %.entry
-; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ffffffc, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 11
; CHECK-NEXT: v_mov_b32_e32 v1, 22
; CHECK-NEXT: v_mov_b32_e32 v2, 33
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 2, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v3
; CHECK-NEXT: v_mov_b32_e32 v3, 44
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; CHECK-NEXT: s_endpgm
@@ -40,12 +40,12 @@ define amdgpu_cs void @test1(i32 %arg1, <4 x i32> inreg %arg2, i32, ptr addrspac
define amdgpu_cs void @test1_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2, i32, ptr addrspace(6) inreg %arg3) {
; CHECK-LABEL: test1_ptr:
; CHECK: ; %bb.0: ; %.entry
-; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ffffffc, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 11
; CHECK-NEXT: v_mov_b32_e32 v1, 22
; CHECK-NEXT: v_mov_b32_e32 v2, 33
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 2, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, s4, v3
; CHECK-NEXT: v_mov_b32_e32 v3, 44
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; CHECK-NEXT: s_endpgm
@@ -76,11 +76,11 @@ define amdgpu_cs void @test1_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2, i32, p
define amdgpu_cs void @test2(i32 %arg1, <4 x i32> inreg %arg2) {
; CHECK-LABEL: test2:
; CHECK: ; %bb.0: ; %.entry
-; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ffffffc, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 11
; CHECK-NEXT: v_mov_b32_e32 v1, 22
; CHECK-NEXT: v_mov_b32_e32 v2, 33
-; CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v3
; CHECK-NEXT: v_mov_b32_e32 v3, 44
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; CHECK-NEXT: s_endpgm
@@ -111,11 +111,11 @@ define amdgpu_cs void @test2(i32 %arg1, <4 x i32> inreg %arg2) {
define amdgpu_cs void @test2_ptr(i32 %arg1, ptr addrspace(8) inreg %arg2) {
; CHECK-LABEL: test2_ptr:
; CHECK: ; %bb.0: ; %.entry
-; CHECK-NEXT: v_and_b32_e32 v3, 0x3ffffffc, v0
+; CHECK-NEXT: v_and_b32_e32 v0, 0x3ffffffc, v0
+; CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 11
; CHECK-NEXT: v_mov_b32_e32 v1, 22
; CHECK-NEXT: v_mov_b32_e32 v2, 33
-; CHECK-NEXT: v_lshlrev_b32_e32 v4, 2, v3
; CHECK-NEXT: v_mov_b32_e32 v3, 44
; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
index 0f84f12a12574..4203821d7a35e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx700 < %s | FileCheck %s
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-reassoc.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-reassoc.mir
index 25aa329eabc47..f6071c18f9178 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-reassoc.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-reassoc.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs -o - %s | FileCheck %s
---
name: test_reassoc_infinite_loop
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir
index 047452fa97140..f007e0cb57044 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-select.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
---
name: select_from_different_results_of_unmerge_values
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
index 7c0265d89ece1..aa881ac2d413a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck -check-prefix=FAST %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck -check-prefix=GREEDY %s
; Natural mapping
define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
index 152133692e6bb..012bd6e9524f8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
index 423f573a5c1de..b63982a4183e6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.ptr.buffer.load.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @raw_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
index 4d6e4a5074630..c40ccabb83123 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
index e0b5386181182..8850217889124 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
index 88c0069962fae..c6875f5d0b1ed 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.load.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-fast -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -regbankselect-greedy -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps float @struct_ptr_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll
index 3ed56fffa7354..1b36ceb87362a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.ptr.buffer.store.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx600 -stop-after=amdgpu-regbanklegalize -o - %s | FileCheck %s
; Natural mapping
define amdgpu_ps void @struct_ptr_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, float %val, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir
index 26ee9dce8dbd9..0c32319a04837 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fabs.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -verify-machineinstrs -o - | FileCheck %s
---
name: fabs_s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir
index a2a3d85b41e1c..d12235a3c3064 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-fneg.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -verify-machineinstrs -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass='amdgpu-regbankselect,amdgpu-regbanklegalize' %s -verify-machineinstrs -o - | FileCheck %s
---
name: fneg_s
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
index aec68e5a5b713..72d575e1e5598 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-inttoptr.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
index 57b7a822ed7aa..cafc0ffed0f78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptr-add.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s
---
name: gep_p0_s_k
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
index 31cbae9ab47ff..de232db6464c2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-ptrtoint.mir
@@ -1,6 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
-# RUN: llc -mtriple=amdgcn -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-fast | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=regbankselect %s -verify-machineinstrs -o - -regbankselect-greedy | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s
---
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 070335fbac64b..552bc2bf465d6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -8,67 +8,67 @@ define i32 @v_sdiv_i32(i32 %num, i32 %den) {
; GISEL-LABEL: v_sdiv_i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v1
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT: v_xor_b32_e32 v2, v5, v2
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v3, v0, v3
+; GISEL-NEXT: v_mul_lo_u32 v4, v3, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v1, v2, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CGP-NEXT: v_xor_b32_e32 v4, v2, v3
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v3
-; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
-; CGP-NEXT: v_rcp_f32_e32 v2, v2
-; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v2, v1
-; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v2
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; CGP-NEXT: v_rcp_f32_e32 v3, v3
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT: v_xor_b32_e32 v2, v5, v2
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v3
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v3, v1
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i32 %num, %den
ret i32 %result
@@ -80,22 +80,22 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) {
; GISEL-LABEL: s_sdiv_i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_ashr_i32 s2, s0, 31
-; GISEL-NEXT: s_ashr_i32 s3, s1, 31
-; GISEL-NEXT: s_add_i32 s0, s0, s2
-; GISEL-NEXT: s_add_i32 s1, s1, s3
-; GISEL-NEXT: s_xor_b32 s1, s1, s3
+; GISEL-NEXT: s_ashr_i32 s2, s1, 31
+; GISEL-NEXT: s_add_i32 s1, s1, s2
+; GISEL-NEXT: s_xor_b32 s1, s1, s2
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GISEL-NEXT: s_sub_i32 s4, 0, s1
+; GISEL-NEXT: s_sub_i32 s3, 0, s1
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GISEL-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s5, v0
-; GISEL-NEXT: s_mul_i32 s4, s4, s5
-; GISEL-NEXT: v_mul_hi_u32 v0, v0, s4
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
-; GISEL-NEXT: s_add_i32 s4, s5, s4
-; GISEL-NEXT: s_xor_b32 s0, s0, s2
+; GISEL-NEXT: s_mul_i32 s3, s3, s4
+; GISEL-NEXT: v_mul_hi_u32 v0, v0, s3
+; GISEL-NEXT: s_ashr_i32 s3, s0, 31
+; GISEL-NEXT: s_add_i32 s0, s0, s3
+; GISEL-NEXT: s_xor_b32 s0, s0, s3
+; GISEL-NEXT: v_readfirstlane_b32 s5, v0
+; GISEL-NEXT: s_add_i32 s4, s4, s5
; GISEL-NEXT: v_mov_b32_e32 v0, s4
; GISEL-NEXT: v_mul_hi_u32 v0, s0, v0
; GISEL-NEXT: v_readfirstlane_b32 s4, v0
@@ -114,50 +114,50 @@ define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) {
; GISEL-NEXT: s_add_i32 s1, s4, 1
; GISEL-NEXT: s_cmp_lg_u32 s0, 0
; GISEL-NEXT: s_cselect_b32 s0, s1, s4
-; GISEL-NEXT: s_xor_b32 s1, s2, s3
+; GISEL-NEXT: s_xor_b32 s1, s3, s2
; GISEL-NEXT: s_xor_b32 s0, s0, s1
; GISEL-NEXT: s_sub_i32 s0, s0, s1
; GISEL-NEXT: ; return to shader part epilog
;
; CGP-LABEL: s_sdiv_i32:
; CGP: ; %bb.0:
-; CGP-NEXT: s_ashr_i32 s2, s0, 31
-; CGP-NEXT: s_ashr_i32 s3, s1, 31
-; CGP-NEXT: s_xor_b32 s4, s2, s3
-; CGP-NEXT: s_add_i32 s0, s0, s2
-; CGP-NEXT: s_add_i32 s1, s1, s3
-; CGP-NEXT: s_xor_b32 s1, s1, s3
+; CGP-NEXT: s_ashr_i32 s2, s1, 31
+; CGP-NEXT: s_add_i32 s1, s1, s2
+; CGP-NEXT: s_xor_b32 s1, s1, s2
; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1
; CGP-NEXT: s_sub_i32 s3, 0, s1
; CGP-NEXT: v_rcp_f32_e32 v0, v0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_readfirstlane_b32 s5, v0
-; CGP-NEXT: s_mul_i32 s3, s3, s5
+; CGP-NEXT: v_readfirstlane_b32 s4, v0
+; CGP-NEXT: s_mul_i32 s3, s3, s4
; CGP-NEXT: v_mul_hi_u32 v0, v0, s3
-; CGP-NEXT: v_readfirstlane_b32 s3, v0
-; CGP-NEXT: s_add_i32 s3, s5, s3
-; CGP-NEXT: s_xor_b32 s0, s0, s2
-; CGP-NEXT: v_mov_b32_e32 v0, s3
+; CGP-NEXT: s_ashr_i32 s3, s0, 31
+; CGP-NEXT: s_add_i32 s0, s0, s3
+; CGP-NEXT: s_xor_b32 s0, s0, s3
+; CGP-NEXT: v_readfirstlane_b32 s5, v0
+; CGP-NEXT: s_add_i32 s4, s4, s5
+; CGP-NEXT: v_mov_b32_e32 v0, s4
; CGP-NEXT: v_mul_hi_u32 v0, s0, v0
-; CGP-NEXT: v_readfirstlane_b32 s2, v0
-; CGP-NEXT: s_mul_i32 s3, s2, s1
-; CGP-NEXT: s_sub_i32 s0, s0, s3
+; CGP-NEXT: s_xor_b32 s2, s3, s2
+; CGP-NEXT: v_readfirstlane_b32 s3, v0
+; CGP-NEXT: s_mul_i32 s4, s3, s1
+; CGP-NEXT: s_sub_i32 s0, s0, s4
; CGP-NEXT: s_cmp_ge_u32 s0, s1
-; CGP-NEXT: s_cselect_b32 s3, 1, 0
-; CGP-NEXT: s_add_i32 s5, s2, 1
-; CGP-NEXT: s_cmp_lg_u32 s3, 0
-; CGP-NEXT: s_cselect_b32 s2, s5, s2
+; CGP-NEXT: s_cselect_b32 s4, 1, 0
+; CGP-NEXT: s_add_i32 s5, s3, 1
+; CGP-NEXT: s_cmp_lg_u32 s4, 0
+; CGP-NEXT: s_cselect_b32 s3, s5, s3
; CGP-NEXT: s_sub_i32 s5, s0, s1
-; CGP-NEXT: s_cmp_lg_u32 s3, 0
+; CGP-NEXT: s_cmp_lg_u32 s4, 0
; CGP-NEXT: s_cselect_b32 s0, s5, s0
; CGP-NEXT: s_cmp_ge_u32 s0, s1
; CGP-NEXT: s_cselect_b32 s0, 1, 0
-; CGP-NEXT: s_add_i32 s1, s2, 1
+; CGP-NEXT: s_add_i32 s1, s3, 1
; CGP-NEXT: s_cmp_lg_u32 s0, 0
-; CGP-NEXT: s_cselect_b32 s0, s1, s2
-; CGP-NEXT: s_xor_b32 s0, s0, s4
-; CGP-NEXT: s_sub_i32 s0, s0, s4
+; CGP-NEXT: s_cselect_b32 s0, s1, s3
+; CGP-NEXT: s_xor_b32 s0, s0, s2
+; CGP-NEXT: s_sub_i32 s0, s0, s2
; CGP-NEXT: ; return to shader part epilog
%result = sdiv i32 %num, %den
%readlane = call i32 @llvm.amdgcn.readfirstlane(i32 %result)
@@ -168,123 +168,123 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_sdiv_v2i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v9, v6, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v1
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
+; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v2
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
+; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v8
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v8
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v9
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v11, v7
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v2
; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT: v_xor_b32_e32 v8, v4, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT: v_xor_b32_e32 v9, v6, v7
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v5
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v8, v5
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v6
+; CGP-NEXT: v_xor_b32_e32 v4, v6, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; CGP-NEXT: v_mul_hi_u32 v8, v5, v8
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
+; CGP-NEXT: v_mul_hi_u32 v5, v0, v5
+; CGP-NEXT: v_rcp_f32_e32 v6, v8
+; CGP-NEXT: v_mul_lo_u32 v8, v5, v2
+; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v10, v5, v3
-; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
+; CGP-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v8, v8, v6
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT: v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_mul_hi_u32 v6, v1, v6
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v2, v6, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_xor_b32_e32 v4, v10, v7
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v6
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v8
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v9
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
+; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v4
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i32> %num, %den
ret <2 x i32> %result
@@ -304,18 +304,31 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
}
define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
-; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2
-; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0
-; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v2
+; GISEL-NEXT: v_lshrrev_b32_e32 v3, 20, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v1, 12, v1
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
+; CGP: ; %bb.0:
+; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v2, 20, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CGP-NEXT: v_lshrrev_b32_e32 v2, 20, v2
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; CGP-NEXT: v_ashrrev_i32_e32 v1, 12, v1
+; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
}
@@ -339,48 +352,48 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-LABEL: v_sdiv_v2i32_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_mov_b32_e32 v3, 0x12d8fb
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
+; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v3
; GISEL-NEXT: v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x12d8fb
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GISEL-NEXT: s_mul_i32 s5, s4, 0xffed2705
+; GISEL-NEXT: v_mul_hi_u32 v2, v2, s5
+; GISEL-NEXT: v_readfirstlane_b32 s5, v2
+; GISEL-NEXT: s_add_i32 s6, s4, s5
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, s6
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_readfirstlane_b32 s4, v4
-; GISEL-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GISEL-NEXT: v_mul_hi_u32 v4, v4, s5
-; GISEL-NEXT: v_readfirstlane_b32 s5, v4
-; GISEL-NEXT: s_add_i32 s4, s4, s5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, s4
-; GISEL-NEXT: v_mul_hi_u32 v7, v1, s4
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v0, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 0xffed2705, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, s6
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v2
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v8, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0xffed2705, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v3, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v2
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -394,10 +407,10 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CGP-NEXT: v_ashrrev_i32_e32 v0, 20, v0
; CGP-NEXT: v_ashrrev_i32_e32 v1, 20, v1
-; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v0
-; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT: v_lshrrev_b32_e32 v3, 31, v0
+; CGP-NEXT: v_lshrrev_b32_e32 v2, 31, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i32> %num, <i32 1235195, i32 1235195>
ret <2 x i32> %result
@@ -408,34 +421,34 @@ define i32 @v_sdiv_i32_pow2_shl_denom(i32 %x, i32 %y) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_lshl_b32_e32 v1, 0x1000, v1
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT: v_cvt_f32_u32_e32 v4, v1
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, 0, v1
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CHECK-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CHECK-NEXT: v_mul_lo_u32 v5, v5, v4
-; CHECK-NEXT: v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT: v_mul_hi_u32 v4, v0, v4
-; CHECK-NEXT: v_mul_lo_u32 v5, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
+; CHECK-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v5
+; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT: v_xor_b32_e32 v2, v5, v2
+; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3
+; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
+; CHECK-NEXT: v_mul_lo_u32 v4, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v3
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
-; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v4
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; CHECK-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v3
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
-; CHECK-NEXT: v_xor_b32_e32 v1, v2, v3
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl i32 4096, %y
%r = sdiv i32 %x, %shl.y
@@ -447,125 +460,125 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v4, v4, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_xor_b32_e32 v5, v5, v7
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
+; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v8
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v6
+; GISEL-NEXT: v_ashrrev_i32_e32 v11, 31, v1
+; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v5, v2
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc
; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v8, v6, v2
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v3
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v7
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v11, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v8, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v10, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v7, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v11, v8
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
-; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; CGP-NEXT: v_xor_b32_e32 v8, v4, v6
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v4, v5, v7
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v4
; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
+; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT: v_rcp_f32_e32 v5, v5
-; CGP-NEXT: v_rcp_f32_e32 v7, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v0
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8
; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT: v_mul_f32_e32 v7, 0x4f7ffffe, v7
; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v5
-; CGP-NEXT: v_mul_lo_u32 v9, v9, v7
-; CGP-NEXT: v_mul_hi_u32 v6, v5, v6
-; CGP-NEXT: v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v9
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v8
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_xor_b32_e32 v4, v6, v4
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v6
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_mul_hi_u32 v7, v5, v7
+; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v1
+; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; CGP-NEXT: v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v1, v6
+; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
; CGP-NEXT: v_mul_lo_u32 v7, v5, v2
; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_mul_lo_u32 v10, v6, v3
-; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v6
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v10
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v9, s[6:7], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v10
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v10
+; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v6, v1, v6
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v5, v9, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v2, v6, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_xor_b32_e32 v4, v10, v8
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v6
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v6, v9, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v8
+; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v5, vcc
; CGP-NEXT: v_xor_b32_e32 v1, v1, v4
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v4
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
@@ -577,10 +590,10 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_sdiv_i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -603,10 +616,10 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
; CGP-LABEL: v_sdiv_i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -635,95 +648,95 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_sdiv_v2i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v7, v4, v2
+; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v3
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir
index c87284fade303..61aeb55b910fd 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/selected-inst-flags.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn-- -run-pass=instruction-select -o - %s | FileCheck %s
+# RUN: llc -mtriple=amdgcn-- -mcpu=gfx600 -run-pass=instruction-select -o - %s | FileCheck %s
# Checks MI Flags are preserved on selected instructions.
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
index a18562d23168e..b0d3a1e482f14 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shufflevector-pointer-crash.mir
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
-# RUN: llc -mtriple=amdgcn -run-pass=legalizer %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx600 -run-pass=legalizer %s -o - | FileCheck %s
---
name: test
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 2cc1949bdca24..27f58ff9f0438 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -8,61 +8,61 @@ define i32 @v_srem_i32(i32 %num, i32 %den) {
; GISEL-LABEL: v_srem_i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v1
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v3
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v3, v0, v3
-; GISEL-NEXT: v_mul_lo_u32 v3, v3, v1
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
+; GISEL-NEXT: v_mul_lo_u32 v3, v3, v2
+; GISEL-NEXT: v_mul_hi_u32 v3, v2, v3
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v2, v2, v1
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v3
-; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
-; CGP-NEXT: v_rcp_f32_e32 v3, v3
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v1
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v2
+; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; CGP-NEXT: v_rcp_f32_e32 v2, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v1
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem i32 %num, %den
ret i32 %result
@@ -74,22 +74,22 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
; GISEL-LABEL: s_srem_i32:
; GISEL: ; %bb.0:
-; GISEL-NEXT: s_ashr_i32 s2, s0, 31
-; GISEL-NEXT: s_ashr_i32 s3, s1, 31
-; GISEL-NEXT: s_add_i32 s0, s0, s2
-; GISEL-NEXT: s_add_i32 s1, s1, s3
-; GISEL-NEXT: s_xor_b32 s1, s1, s3
+; GISEL-NEXT: s_ashr_i32 s2, s1, 31
+; GISEL-NEXT: s_add_i32 s1, s1, s2
+; GISEL-NEXT: s_xor_b32 s1, s1, s2
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, s1
-; GISEL-NEXT: s_sub_i32 s3, 0, s1
+; GISEL-NEXT: s_sub_i32 s2, 0, s1
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
; GISEL-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; GISEL-NEXT: v_cvt_u32_f32_e32 v0, v0
-; GISEL-NEXT: v_readfirstlane_b32 s4, v0
-; GISEL-NEXT: s_mul_i32 s3, s3, s4
-; GISEL-NEXT: v_mul_hi_u32 v0, v0, s3
; GISEL-NEXT: v_readfirstlane_b32 s3, v0
-; GISEL-NEXT: s_add_i32 s3, s4, s3
+; GISEL-NEXT: s_mul_i32 s2, s2, s3
+; GISEL-NEXT: v_mul_hi_u32 v0, v0, s2
+; GISEL-NEXT: s_ashr_i32 s2, s0, 31
+; GISEL-NEXT: s_add_i32 s0, s0, s2
; GISEL-NEXT: s_xor_b32 s0, s0, s2
+; GISEL-NEXT: v_readfirstlane_b32 s4, v0
+; GISEL-NEXT: s_add_i32 s3, s3, s4
; GISEL-NEXT: v_mov_b32_e32 v0, s3
; GISEL-NEXT: v_mul_hi_u32 v0, s0, v0
; GISEL-NEXT: v_readfirstlane_b32 s3, v0
@@ -111,22 +111,22 @@ define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
;
; CGP-LABEL: s_srem_i32:
; CGP: ; %bb.0:
-; CGP-NEXT: s_ashr_i32 s2, s0, 31
-; CGP-NEXT: s_ashr_i32 s3, s1, 31
-; CGP-NEXT: s_add_i32 s0, s0, s2
-; CGP-NEXT: s_add_i32 s1, s1, s3
-; CGP-NEXT: s_xor_b32 s1, s1, s3
+; CGP-NEXT: s_ashr_i32 s2, s1, 31
+; CGP-NEXT: s_add_i32 s1, s1, s2
+; CGP-NEXT: s_xor_b32 s1, s1, s2
; CGP-NEXT: v_cvt_f32_u32_e32 v0, s1
-; CGP-NEXT: s_sub_i32 s3, 0, s1
+; CGP-NEXT: s_sub_i32 s2, 0, s1
; CGP-NEXT: v_rcp_f32_e32 v0, v0
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_readfirstlane_b32 s4, v0
-; CGP-NEXT: s_mul_i32 s3, s3, s4
-; CGP-NEXT: v_mul_hi_u32 v0, v0, s3
; CGP-NEXT: v_readfirstlane_b32 s3, v0
-; CGP-NEXT: s_add_i32 s3, s4, s3
+; CGP-NEXT: s_mul_i32 s2, s2, s3
+; CGP-NEXT: v_mul_hi_u32 v0, v0, s2
+; CGP-NEXT: s_ashr_i32 s2, s0, 31
+; CGP-NEXT: s_add_i32 s0, s0, s2
; CGP-NEXT: s_xor_b32 s0, s0, s2
+; CGP-NEXT: v_readfirstlane_b32 s4, v0
+; CGP-NEXT: s_add_i32 s3, s3, s4
; CGP-NEXT: v_mov_b32_e32 v0, s3
; CGP-NEXT: v_mul_hi_u32 v0, s0, v0
; CGP-NEXT: v_readfirstlane_b32 s3, v0
@@ -154,111 +154,111 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_srem_v2i32:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v5
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v2
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v2
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v6
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_mul_hi_u32 v7, v4, v7
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v2
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v5
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT: v_cvt_f32_u32_e32 v5, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v5, v5
-; CGP-NEXT: v_rcp_f32_e32 v8, v8
-; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
-; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
-; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT: v_mul_hi_u32 v7, v5, v7
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; CGP-NEXT: v_rcp_f32_e32 v4, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_xor_b32_e32 v3, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v4
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; CGP-NEXT: v_mul_hi_u32 v7, v4, v7
+; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
+; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v6
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT: v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_mul_lo_u32 v2, v4, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i32> %num, %den
ret <2 x i32> %result
@@ -279,20 +279,35 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
}
define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
-; CHECK-LABEL: v_srem_v2i32_pow2k_denom:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT: v_and_b32_e32 v2, 0xfff, v2
-; CHECK-NEXT: v_and_b32_e32 v3, 0xfff, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT: v_and_b32_e32 v0, 0xfff, v0
-; CHECK-NEXT: v_and_b32_e32 v1, 0xfff, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GISEL-LABEL: v_srem_v2i32_pow2k_denom:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; GISEL-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT: v_and_b32_e32 v2, 0xfff, v2
+; GISEL-NEXT: v_and_b32_e32 v3, 0xfff, v3
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xfff, v1
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_srem_v2i32_pow2k_denom:
+; CGP: ; %bb.0:
+; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; CGP-NEXT: v_and_b32_e32 v2, 0xfff, v2
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_and_b32_e32 v0, 0xfff, v0
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CGP-NEXT: v_and_b32_e32 v2, 0xfff, v2
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_and_b32_e32 v1, 0xfff, v1
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
}
@@ -319,44 +334,44 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
; GISEL-LABEL: v_srem_v2i32_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_mov_b32_e32 v3, 0x12d8fb
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x12d8fb
-; GISEL-NEXT: v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT: v_cvt_f32_u32_e32 v2, 0x12d8fb
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_mov_b32_e32 v4, 0xffed2705
; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_readfirstlane_b32 s4, v4
+; GISEL-NEXT: v_mov_b32_e32 v3, 0x12d8fb
+; GISEL-NEXT: v_readfirstlane_b32 s4, v2
; GISEL-NEXT: s_mul_i32 s5, s4, 0xffed2705
-; GISEL-NEXT: v_mul_hi_u32 v4, v4, s5
-; GISEL-NEXT: v_readfirstlane_b32 s5, v4
+; GISEL-NEXT: v_mul_hi_u32 v2, v2, s5
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT: v_readfirstlane_b32 s5, v2
; GISEL-NEXT: s_add_i32 s4, s4, s5
-; GISEL-NEXT: v_mul_hi_u32 v4, v0, s4
+; GISEL-NEXT: v_mul_hi_u32 v2, v0, s4
; GISEL-NEXT: v_mul_hi_u32 v7, v1, s4
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 0xffed2705, v1
+; GISEL-NEXT: v_mul_lo_u32 v2, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v0, v4
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v0, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, 0xffed2705, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v0, v4
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GISEL-NEXT: v_mul_lo_u32 v2, v7, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v1
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 0xffed2705, v1
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -364,20 +379,20 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_mov_b32_e32 v2, 0xd9528441
-; CGP-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; CGP-NEXT: v_mul_hi_i32 v4, v0, v2
+; CGP-NEXT: v_mul_hi_i32 v3, v0, v2
; CGP-NEXT: v_mul_hi_i32 v2, v1, v2
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v0
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v1
-; CGP-NEXT: v_ashrrev_i32_e32 v4, 20, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v3, 20, v3
; CGP-NEXT: v_ashrrev_i32_e32 v2, 20, v2
-; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v4
-; CGP-NEXT: v_lshrrev_b32_e32 v6, 31, v2
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT: v_mul_lo_u32 v2, v2, v3
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_lshrrev_b32_e32 v4, 31, v3
+; CGP-NEXT: v_lshrrev_b32_e32 v5, 31, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_mov_b32_e32 v4, 0xffed2705
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = srem <2 x i32> %num, <i32 1235195, i32 1235195>
@@ -389,31 +404,31 @@ define i32 @v_srem_i32_pow2_shl_denom(i32 %x, i32 %y) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_lshl_b32_e32 v1, 0x1000, v1
-; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v3
-; CHECK-NEXT: v_cvt_f32_u32_e32 v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, 0, v1
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v3, v3
-; CHECK-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3
-; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
-; CHECK-NEXT: v_mul_lo_u32 v3, v3, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_xor_b32_e32 v1, v1, v2
+; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v1
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CHECK-NEXT: v_ashrrev_i32_e32 v4, 31, v0
+; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_lo_u32 v3, v3, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v1
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v1
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; CHECK-NEXT: v_xor_b32_e32 v0, v0, v4
+; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
; CHECK-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl i32 4096, %y
%r = srem i32 %x, %shl.y
@@ -425,114 +440,114 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
+; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
-; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
; GISEL-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v6
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; GISEL-NEXT: v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v3
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v6, v0, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v6, v6, v2
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v3
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v6, v4
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_mul_lo_u32 v2, v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i32_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v2
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_xor_b32_e32 v2, v2, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
-; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v0
-; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_rcp_f32_e32 v4, v4
; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
+; CGP-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v6
+; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
; CGP-NEXT: v_xor_b32_e32 v3, v3, v7
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v8, v3
-; CGP-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_rcp_f32_e32 v8, v8
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_cvt_f32_u32_e32 v7, v3
+; CGP-NEXT: v_ashrrev_i32_e32 v8, 31, v1
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_rcp_f32_e32 v6, v7
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_mul_f32_e32 v8, 0x4f7ffffe, v8
; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v9
-; CGP-NEXT: v_mul_hi_u32 v6, v0, v6
-; CGP-NEXT: v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT: v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v6
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT: v_mul_hi_u32 v4, v6, v4
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_xor_b32_e32 v0, v0, v5
+; CGP-NEXT: v_mul_lo_u32 v2, v4, v3
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v4
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_xor_b32_e32 v1, v1, v8
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = srem <2 x i32> %x, %shl.y
@@ -543,10 +558,10 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_srem_i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -567,10 +582,10 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
; CGP-LABEL: v_srem_i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -597,87 +612,87 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_srem_v2i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_srem_v2i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index b88201f295ef0..16e452c0f3877 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -133,86 +133,86 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v7, v4, v2
+; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v3
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i32> %num, %den
ret <2 x i32> %result
@@ -309,90 +309,90 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v7, v4, v2
+; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v3
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = udiv <2 x i32> %x, %shl.y
@@ -403,10 +403,10 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_udiv_i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -429,10 +429,10 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
; CGP-LABEL: v_udiv_i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -461,95 +461,95 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_udiv_v2i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, v4, v2
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v2
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v5, v3
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_mul_lo_u32 v7, v4, v2
+; CGP-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v2
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_mul_lo_u32 v7, v5, v3
+; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v2
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v5, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 9563581012294..029bcc06e6f85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -8,12 +8,12 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-LABEL: v_udiv_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_or_b32_e32 v1, v5, v3
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -26,126 +26,126 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v6, v6
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v6
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v10, v1, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v6, v1
+; CHECK-NEXT: v_mul_hi_u32 v9, v6, v0
; CHECK-NEXT: v_mul_lo_u32 v11, v7, v0
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v9
-; CHECK-NEXT: v_mul_lo_u32 v12, v6, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v6, v0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v10
; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8
-; CHECK-NEXT: v_mul_hi_u32 v13, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v14, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v1, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v1, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v12
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v0
+; CHECK-NEXT: v_mul_lo_u32 v9, v6, v1
; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v11, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v0, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v0, v1
-; CHECK-NEXT: v_mul_lo_u32 v12, v6, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v6, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v6, v6, v0
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v6
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v7
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc
-; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v4, v1
-; CHECK-NEXT: v_mul_lo_u32 v10, v5, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_mul_hi_u32 v7, v1, v7
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
+; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_mul_hi_u32 v8, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_mul_lo_u32 v7, v2, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v0
-; CHECK-NEXT: v_mul_lo_u32 v9, v3, v0
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CHECK-NEXT: v_mul_lo_u32 v6, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v2, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v2, v1
+; CHECK-NEXT: v_mul_lo_u32 v9, v3, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v2, v0
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v8
; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v5, v6, vcc
; CHECK-NEXT: v_sub_i32_e64 v5, s[4:5], v5, v6
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v6, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
; CHECK-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v4, vcc, 0, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -156,14 +156,14 @@ define i64 @v_udiv_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0
-; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mul_lo_u32 v3, v0, v2
; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v0
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v4, v3
@@ -188,144 +188,144 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
; CHECK-NEXT: v_cmp_ne_u64_e64 s[4:5], s[4:5], 0
-; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; CHECK-NEXT: s_mov_b32 s6, 1
-; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
+; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_scc0 .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
-; CHECK-NEXT: v_mov_b32_e32 v2, s1
-; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s3
+; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1
-; CHECK-NEXT: v_trunc_f32_e32 v3, v3
-; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
+; CHECK-NEXT: v_trunc_f32_e32 v2, v2
+; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, s4, v1
; CHECK-NEXT: v_readfirstlane_b32 s7, v1
-; CHECK-NEXT: v_mul_hi_u32 v4, s4, v1
-; CHECK-NEXT: s_mul_i32 s8, s4, s6
; CHECK-NEXT: s_mul_i32 s9, s4, s7
-; CHECK-NEXT: v_readfirstlane_b32 s10, v4
-; CHECK-NEXT: s_mul_i32 s11, s5, s7
-; CHECK-NEXT: s_add_i32 s8, s10, s8
+; CHECK-NEXT: v_readfirstlane_b32 s6, v2
; CHECK-NEXT: v_mul_hi_u32 v4, v1, s9
-; CHECK-NEXT: s_mul_i32 s10, s6, s9
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, s9
-; CHECK-NEXT: s_add_i32 s8, s8, s11
-; CHECK-NEXT: v_readfirstlane_b32 s9, v4
-; CHECK-NEXT: v_readfirstlane_b32 s11, v5
+; CHECK-NEXT: s_mul_i32 s8, s4, s6
+; CHECK-NEXT: v_readfirstlane_b32 s12, v3
+; CHECK-NEXT: s_mul_i32 s10, s5, s7
+; CHECK-NEXT: s_add_i32 s8, s12, s8
+; CHECK-NEXT: s_add_i32 s8, s8, s10
+; CHECK-NEXT: v_readfirstlane_b32 s10, v4
; CHECK-NEXT: s_mul_i32 s12, s7, s8
+; CHECK-NEXT: v_mul_hi_u32 v3, v2, s9
; CHECK-NEXT: v_mul_hi_u32 v1, v1, s8
-; CHECK-NEXT: s_mul_i32 s13, s6, s8
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, s8
-; CHECK-NEXT: s_add_u32 s8, s9, s12
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
-; CHECK-NEXT: v_readfirstlane_b32 s12, v3
-; CHECK-NEXT: s_cselect_b32 s14, 1, 0
-; CHECK-NEXT: s_add_u32 s8, s8, s10
-; CHECK-NEXT: s_cselect_b32 s8, 1, 0
-; CHECK-NEXT: s_add_i32 s14, s14, s8
-; CHECK-NEXT: s_add_u32 s8, s11, s9
+; CHECK-NEXT: s_mul_i32 s11, s6, s9
+; CHECK-NEXT: s_add_u32 s10, s10, s12
+; CHECK-NEXT: s_cselect_b32 s12, 1, 0
+; CHECK-NEXT: s_add_u32 s10, s10, s11
; CHECK-NEXT: s_cselect_b32 s9, 1, 0
-; CHECK-NEXT: s_add_u32 s8, s8, s13
-; CHECK-NEXT: s_cselect_b32 s10, 1, 0
-; CHECK-NEXT: s_add_i32 s9, s9, s10
-; CHECK-NEXT: s_add_u32 s8, s8, s14
+; CHECK-NEXT: s_add_i32 s12, s12, s9
+; CHECK-NEXT: v_readfirstlane_b32 s9, v3
+; CHECK-NEXT: v_readfirstlane_b32 s10, v1
+; CHECK-NEXT: s_mul_i32 s11, s6, s8
+; CHECK-NEXT: s_add_u32 s9, s9, s10
; CHECK-NEXT: s_cselect_b32 s10, 1, 0
+; CHECK-NEXT: s_add_u32 s9, s9, s11
+; CHECK-NEXT: s_cselect_b32 s11, 1, 0
+; CHECK-NEXT: v_mul_hi_u32 v1, v2, s8
+; CHECK-NEXT: s_add_i32 s10, s10, s11
+; CHECK-NEXT: s_add_u32 s8, s9, s12
+; CHECK-NEXT: s_cselect_b32 s9, 1, 0
+; CHECK-NEXT: s_add_i32 s10, s10, s9
+; CHECK-NEXT: v_readfirstlane_b32 s9, v1
; CHECK-NEXT: s_add_i32 s9, s9, s10
-; CHECK-NEXT: s_add_i32 s12, s12, s9
; CHECK-NEXT: s_add_u32 s7, s7, s8
-; CHECK-NEXT: s_addc_u32 s6, s6, s12
-; CHECK-NEXT: s_mul_i32 s8, s4, s7
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mul_hi_u32 v1, s4, v1
-; CHECK-NEXT: s_mul_i32 s5, s5, s7
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
+; CHECK-NEXT: s_mul_i32 s8, s4, s7
+; CHECK-NEXT: s_addc_u32 s6, s6, s9
; CHECK-NEXT: s_mul_i32 s4, s4, s6
+; CHECK-NEXT: v_readfirstlane_b32 s9, v1
; CHECK-NEXT: v_mov_b32_e32 v1, s8
; CHECK-NEXT: v_mul_hi_u32 v1, s7, v1
-; CHECK-NEXT: s_mul_i32 s10, s6, s8
-; CHECK-NEXT: v_mov_b32_e32 v3, s6
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, s8
+; CHECK-NEXT: s_mul_i32 s5, s5, s7
; CHECK-NEXT: s_add_i32 s4, s9, s4
-; CHECK-NEXT: v_readfirstlane_b32 s8, v1
-; CHECK-NEXT: v_readfirstlane_b32 s9, v3
; CHECK-NEXT: s_add_i32 s4, s4, s5
-; CHECK-NEXT: s_mul_i32 s5, s7, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v3, s7, v1
-; CHECK-NEXT: s_mul_i32 s4, s6, s4
-; CHECK-NEXT: v_mul_hi_u32 v1, s6, v1
-; CHECK-NEXT: s_add_u32 s5, s8, s5
-; CHECK-NEXT: v_readfirstlane_b32 s8, v3
-; CHECK-NEXT: v_readfirstlane_b32 s11, v1
-; CHECK-NEXT: s_cselect_b32 s12, 1, 0
+; CHECK-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: s_mul_i32 s9, s7, s4
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, s8
+; CHECK-NEXT: v_mul_hi_u32 v3, s7, v2
+; CHECK-NEXT: s_mul_i32 s10, s6, s8
+; CHECK-NEXT: s_add_u32 s5, s5, s9
+; CHECK-NEXT: s_cselect_b32 s9, 1, 0
; CHECK-NEXT: s_add_u32 s5, s5, s10
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
-; CHECK-NEXT: s_add_i32 s12, s12, s5
-; CHECK-NEXT: s_add_u32 s5, s9, s8
+; CHECK-NEXT: s_add_i32 s9, s9, s5
+; CHECK-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-NEXT: v_readfirstlane_b32 s8, v3
+; CHECK-NEXT: s_mul_i32 s4, s6, s4
+; CHECK-NEXT: s_add_u32 s5, s5, s8
; CHECK-NEXT: s_cselect_b32 s8, 1, 0
; CHECK-NEXT: s_add_u32 s4, s5, s4
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
+; CHECK-NEXT: v_mul_hi_u32 v1, s6, v2
; CHECK-NEXT: s_add_i32 s5, s8, s5
-; CHECK-NEXT: s_add_u32 s4, s4, s12
+; CHECK-NEXT: s_add_u32 s4, s4, s9
; CHECK-NEXT: s_cselect_b32 s8, 1, 0
; CHECK-NEXT: s_add_i32 s5, s5, s8
-; CHECK-NEXT: s_add_i32 s11, s11, s5
+; CHECK-NEXT: v_readfirstlane_b32 s8, v1
+; CHECK-NEXT: s_add_i32 s8, s8, s5
; CHECK-NEXT: s_add_u32 s4, s7, s4
-; CHECK-NEXT: s_addc_u32 s5, s6, s11
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mul_hi_u32 v1, s0, v1
-; CHECK-NEXT: s_mul_i32 s6, s1, s4
-; CHECK-NEXT: v_mul_hi_u32 v2, v2, s4
-; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: s_addc_u32 s5, s6, s8
+; CHECK-NEXT: v_mov_b32_e32 v2, s5
; CHECK-NEXT: s_mul_i32 s7, s0, s5
-; CHECK-NEXT: v_readfirstlane_b32 s8, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: v_mul_hi_u32 v2, s0, v1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4
+; CHECK-NEXT: v_mul_hi_u32 v3, s0, v2
+; CHECK-NEXT: s_mul_i32 s8, s1, s4
+; CHECK-NEXT: s_add_u32 s6, s6, s7
+; CHECK-NEXT: s_cselect_b32 s7, 1, 0
+; CHECK-NEXT: s_add_u32 s6, s6, s8
+; CHECK-NEXT: s_cselect_b32 s4, 1, 0
+; CHECK-NEXT: s_add_i32 s7, s7, s4
+; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v3
; CHECK-NEXT: s_mul_i32 s5, s1, s5
-; CHECK-NEXT: v_mul_hi_u32 v1, s1, v1
-; CHECK-NEXT: s_add_u32 s4, s4, s7
-; CHECK-NEXT: v_readfirstlane_b32 s7, v2
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
-; CHECK-NEXT: s_cselect_b32 s10, 1, 0
; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_cselect_b32 s4, 1, 0
-; CHECK-NEXT: s_add_i32 s10, s10, s4
-; CHECK-NEXT: s_add_u32 s4, s8, s7
; CHECK-NEXT: s_cselect_b32 s6, 1, 0
; CHECK-NEXT: s_add_u32 s4, s4, s5
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
; CHECK-NEXT: s_add_i32 s5, s6, s5
-; CHECK-NEXT: s_add_u32 s4, s4, s10
+; CHECK-NEXT: s_add_u32 s4, s4, s7
+; CHECK-NEXT: v_mul_hi_u32 v1, s1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mul_hi_u32 v2, s2, v2
; CHECK-NEXT: s_cselect_b32 s6, 1, 0
-; CHECK-NEXT: s_mul_i32 s7, s2, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v1, s2, v1
-; CHECK-NEXT: s_mul_i32 s8, s3, s4
; CHECK-NEXT: s_add_i32 s5, s5, s6
; CHECK-NEXT: v_readfirstlane_b32 s6, v1
-; CHECK-NEXT: s_add_i32 s5, s9, s5
-; CHECK-NEXT: s_mul_i32 s9, s2, s5
-; CHECK-NEXT: s_add_i32 s6, s6, s9
-; CHECK-NEXT: s_add_i32 s6, s6, s8
-; CHECK-NEXT: s_sub_u32 s7, s0, s7
+; CHECK-NEXT: s_add_i32 s5, s6, s5
+; CHECK-NEXT: v_readfirstlane_b32 s7, v2
+; CHECK-NEXT: s_mul_i32 s8, s2, s5
+; CHECK-NEXT: s_mul_i32 s9, s3, s4
+; CHECK-NEXT: s_add_i32 s7, s7, s8
+; CHECK-NEXT: s_mul_i32 s6, s2, s4
+; CHECK-NEXT: s_add_i32 s7, s7, s9
+; CHECK-NEXT: s_sub_u32 s6, s0, s6
; CHECK-NEXT: s_cselect_b32 s8, 1, 0
-; CHECK-NEXT: s_subb_u32 s9, s1, s6
-; CHECK-NEXT: s_sub_i32 s1, s1, s6
+; CHECK-NEXT: s_subb_u32 s9, s1, s7
+; CHECK-NEXT: s_sub_i32 s1, s1, s7
; CHECK-NEXT: s_cmp_ge_u32 s9, s3
-; CHECK-NEXT: s_cselect_b32 s6, -1, 0
-; CHECK-NEXT: s_cmp_ge_u32 s7, s2
+; CHECK-NEXT: s_cselect_b32 s7, -1, 0
+; CHECK-NEXT: s_cmp_ge_u32 s6, s2
; CHECK-NEXT: s_cselect_b32 s10, -1, 0
; CHECK-NEXT: s_cmp_eq_u32 s9, s3
-; CHECK-NEXT: s_cselect_b32 s10, s10, s6
-; CHECK-NEXT: s_sub_u32 s9, s7, s2
+; CHECK-NEXT: s_cselect_b32 s10, s10, s7
+; CHECK-NEXT: s_sub_u32 s9, s6, s2
; CHECK-NEXT: s_cselect_b32 s6, 1, 0
; CHECK-NEXT: s_cmp_lg_u32 s8, 0
; CHECK-NEXT: s_subb_u32 s1, s1, s3
@@ -365,24 +365,24 @@ define amdgpu_ps i64 @s_udiv_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_add_i32 s1, s3, s1
; CHECK-NEXT: v_mov_b32_e32 v0, s1
; CHECK-NEXT: v_mul_hi_u32 v0, s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v0
-; CHECK-NEXT: s_mul_i32 s3, s1, s2
-; CHECK-NEXT: s_sub_i32 s3, s0, s3
-; CHECK-NEXT: s_cmp_ge_u32 s3, s2
-; CHECK-NEXT: s_cselect_b32 s4, 1, 0
-; CHECK-NEXT: s_add_i32 s0, s1, 1
-; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cselect_b32 s0, s0, s1
-; CHECK-NEXT: s_sub_i32 s1, s3, s2
-; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cselect_b32 s1, s1, s3
-; CHECK-NEXT: s_cmp_ge_u32 s1, s2
-; CHECK-NEXT: s_cselect_b32 s4, 1, 0
-; CHECK-NEXT: s_mov_b32 s3, 0
-; CHECK-NEXT: s_add_i32 s2, s0, 1
-; CHECK-NEXT: s_mov_b32 s1, s3
-; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cselect_b64 s[4:5], s[2:3], s[0:1]
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: v_readfirstlane_b32 s3, v0
+; CHECK-NEXT: s_mul_i32 s4, s3, s2
+; CHECK-NEXT: s_sub_i32 s0, s0, s4
+; CHECK-NEXT: s_cmp_ge_u32 s0, s2
+; CHECK-NEXT: s_cselect_b32 s5, 1, 0
+; CHECK-NEXT: s_add_i32 s4, s3, 1
+; CHECK-NEXT: s_cmp_lg_u32 s5, 0
+; CHECK-NEXT: s_cselect_b32 s4, s4, s3
+; CHECK-NEXT: s_sub_i32 s3, s0, s2
+; CHECK-NEXT: s_cmp_lg_u32 s5, 0
+; CHECK-NEXT: s_cselect_b32 s0, s3, s0
+; CHECK-NEXT: s_cmp_ge_u32 s0, s2
+; CHECK-NEXT: s_cselect_b32 s2, 1, 0
+; CHECK-NEXT: s_add_i32 s0, s4, 1
+; CHECK-NEXT: s_mov_b32 s5, s1
+; CHECK-NEXT: s_cmp_lg_u32 s2, 0
+; CHECK-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
; CHECK-NEXT: .LBB1_5: ; %.split
; CHECK-NEXT: s_mov_b32 s0, s4
; CHECK-NEXT: s_mov_b32 s1, s4
@@ -405,6 +405,8 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
@@ -413,232 +415,230 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12
+; GISEL-NEXT: v_mul_lo_u32 v16, v9, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v12
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11
; GISEL-NEXT: v_mul_lo_u32 v14, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v11
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GISEL-NEXT: v_mul_hi_u32 v10, v0, v8
; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
; GISEL-NEXT: v_mul_lo_u32 v12, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v4, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v4, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v9
; GISEL-NEXT: v_mul_lo_u32 v13, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v12, v4, v8
+; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v15
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5
; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v14
; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v12, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v4, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v5, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v4, vcc
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v4, v4
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v4
+; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v6
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v7, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v17, v13, v1
+; GISEL-NEXT: v_mul_lo_u32 v16, v11, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v12, v5, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; GISEL-NEXT: v_mul_hi_u32 v15, v1, v16
+; GISEL-NEXT: v_mul_lo_u32 v17, v1, v14
+; GISEL-NEXT: v_mul_lo_u32 v12, v4, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v1, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v17, v4, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v4, v14, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v11, v1
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v13, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v1, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v12
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v11
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v10, v4, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v1, v12
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v4, v12
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v4, v12
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v10, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v1, v8
+; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], v4, v10, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v8
+; GISEL-NEXT: v_mul_lo_u32 v11, v2, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v3, v8
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
; GISEL-NEXT: v_mul_lo_u32 v10, v3, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v9, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v6, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v5
; GISEL-NEXT: v_mul_lo_u32 v10, v6, v5
-; GISEL-NEXT: v_mul_lo_u32 v11, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v4, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v7
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
@@ -651,147 +651,147 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v6, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i64:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_mov_b32_e32 v10, v0
; CGP-NEXT: v_mov_b32_e32 v11, v1
; CGP-NEXT: v_mov_b32_e32 v8, v2
-; CGP-NEXT: v_mov_b32_e32 v9, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4
+; CGP-NEXT: v_mov_b32_e32 v10, v0
; CGP-NEXT: v_or_b32_e32 v1, v11, v5
; CGP-NEXT: v_mov_b32_e32 v0, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4
+; CGP-NEXT: v_mov_b32_e32 v9, v3
; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB2_2
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
-; CGP-NEXT: v_trunc_f32_e32 v2, v2
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CGP-NEXT: v_trunc_f32_e32 v1, v1
+; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v2, v1
+; CGP-NEXT: v_mul_hi_u32 v13, v2, v0
; CGP-NEXT: v_mul_lo_u32 v15, v3, v0
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v16, v2, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v2, v0
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v0, v14
; CGP-NEXT: v_mul_lo_u32 v15, v0, v12
-; CGP-NEXT: v_mul_hi_u32 v17, v0, v12
-; CGP-NEXT: v_mul_lo_u32 v18, v2, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v12
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_mul_lo_u32 v16, v1, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v1, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v16, v0, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_mul_lo_u32 v15, v1, v12
; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v2, v12, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v13, v1, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v2, v1
; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v12
-; CGP-NEXT: v_mul_lo_u32 v15, v2, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v12
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v3, v0, v1
-; CGP-NEXT: v_mul_hi_u32 v13, v0, v1
-; CGP-NEXT: v_mul_lo_u32 v16, v2, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v2, v1
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v0
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15
-; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v13, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v0, v3
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v13, v10, v1
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT: v_mul_lo_u32 v13, v1, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CGP-NEXT: v_mul_hi_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v1
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v10, v1
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v1, v11, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v12, v4, v0
-; CGP-NEXT: v_mul_lo_u32 v13, v5, v0
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT: v_mul_lo_u32 v2, v4, v1
-; CGP-NEXT: v_add_i32_e32 v14, vcc, 1, v0
-; CGP-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v14
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v15, vcc
+; CGP-NEXT: v_mul_hi_u32 v2, v4, v0
+; CGP-NEXT: v_mul_lo_u32 v3, v4, v1
+; CGP-NEXT: v_mul_lo_u32 v13, v5, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v0
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v10, v12
; CGP-NEXT: v_subb_u32_e64 v10, s[4:5], v11, v2, vcc
; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v11, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v5, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v10, v5
-; CGP-NEXT: v_cndmask_b32_e32 v10, v13, v11, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v5
; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v11, vcc, 1, v0
+; CGP-NEXT: v_addc_u32_e32 v12, vcc, 0, v1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
-; CGP-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v13, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v11
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v12, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v12, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v4, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -803,14 +803,14 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_cbranch_execz .LBB2_4
; CGP-NEXT: ; %bb.3:
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v2, v2, v0
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mul_lo_u32 v2, v0, v4
; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v0
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v10, v2
@@ -823,10 +823,10 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: .LBB2_4: ; %.split
; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6
; CGP-NEXT: v_or_b32_e32 v3, v9, v7
; CGP-NEXT: v_mov_b32_e32 v2, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -839,126 +839,126 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_setpc_b64 s[30:31]
; CGP-NEXT: .LBB2_7:
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT: v_trunc_f32_e32 v3, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v10, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v10, v4, v3
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v2
; CGP-NEXT: v_mul_lo_u32 v13, v5, v2
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v11
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v4, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v2
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v12
; CGP-NEXT: v_mul_lo_u32 v13, v2, v10
-; CGP-NEXT: v_mul_hi_u32 v15, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v4, v10
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v3, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v2, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v13, v3, v10
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v10, v3, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v10, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_mul_hi_u32 v10, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v3
; CGP-NEXT: v_mul_lo_u32 v5, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v4, v10
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v2, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v4, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v5
+; CGP-NEXT: v_mul_lo_u32 v12, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v5
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v9, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v9, v2
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v9, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v9, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v3, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v5, v3, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v9, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v9, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_lo_u32 v5, v9, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v10, v6, v2
-; CGP-NEXT: v_mul_lo_u32 v11, v7, v2
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v6, v3
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v2
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v6, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v7, v2
+; CGP-NEXT: v_mul_lo_u32 v10, v6, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v8, v10
; CGP-NEXT: v_subb_u32_e64 v8, s[4:5], v9, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v7
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v7, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v8, v7
-; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v6
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v7
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v2
+; CGP-NEXT: v_addc_u32_e32 v10, vcc, 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v7
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; CGP-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v9
+; CGP-NEXT: v_addc_u32_e32 v6, vcc, 0, v10, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v10, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v10, v6, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -969,14 +969,14 @@ define <2 x i64> @v_udiv_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_cbranch_execz .LBB2_6
; CGP-NEXT: .LBB2_8:
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: v_mul_lo_u32 v4, v2, v6
; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v2
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
@@ -1019,27 +1019,27 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440
-; CHECK-NEXT: v_mul_hi_u32 v4, v0, v2
-; CHECK-NEXT: v_mul_lo_u32 v5, v0, v3
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xd9528440
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v0, v4
; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2
; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, v3
-; CHECK-NEXT: v_mul_lo_u32 v7, v1, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT: v_mul_hi_u32 v0, v0, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v5, v1, v4
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v2, v0
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_mul_hi_u32 v1, v1, v3
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
; CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1048,56 +1048,107 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
}
define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
-; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
-; CHECK: ; %bb.0:
-; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
-; CHECK-NEXT: v_mov_b32_e32 v5, 0xd9528440
-; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4
-; CHECK-NEXT: v_mul_lo_u32 v7, v0, v5
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v4
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v4
-; CHECK-NEXT: v_mul_hi_u32 v0, v0, v5
-; CHECK-NEXT: v_mul_lo_u32 v10, v1, v5
-; CHECK-NEXT: v_mul_hi_u32 v1, v1, v5
-; CHECK-NEXT: v_mul_hi_u32 v11, v2, v4
-; CHECK-NEXT: v_mul_lo_u32 v12, v2, v5
-; CHECK-NEXT: v_mul_lo_u32 v13, v3, v4
-; CHECK-NEXT: v_mul_hi_u32 v4, v3, v4
-; CHECK-NEXT: v_mul_hi_u32 v2, v2, v5
-; CHECK-NEXT: v_mul_lo_u32 v14, v3, v5
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, v5
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v4, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v8
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v11, v9
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v7
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
-; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20
-; CHECK-NEXT: s_setpc_b64 s[30:31]
+; GISEL-LABEL: v_udiv_v2i64_oddk_denom:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
+; GISEL-NEXT: v_mov_b32_e32 v6, 0xd9528440
+; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v1, v4
+; GISEL-NEXT: v_mul_hi_u32 v0, v0, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v1, v6
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_mul_hi_u32 v1, v1, v6
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v2, v2, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v3, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[2:3], 20
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; CGP-LABEL: v_udiv_v2i64_oddk_denom:
+; CGP: ; %bb.0:
+; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
+; CGP-NEXT: v_mov_b32_e32 v6, 0xd9528440
+; CGP-NEXT: v_mul_hi_u32 v5, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v0, v0, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v8, v1, v4
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v1, v6
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT: v_mul_hi_u32 v1, v1, v6
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_mul_hi_u32 v5, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v2, v6
+; CGP-NEXT: v_mul_lo_u32 v8, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v2, v2, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT: v_mul_lo_u32 v7, v3, v6
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v4, v2
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; CGP-NEXT: v_mul_hi_u32 v3, v3, v6
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_lshr_b64 v[0:1], v[0:1], 20
+; CGP-NEXT: v_lshr_b64 v[2:3], v[2:3], 20
+; CGP-NEXT: s_setpc_b64 s[30:31]
%result = udiv <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
}
@@ -1110,11 +1161,11 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_mov_b32_e32 v4, v1
; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v7, 0
; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2
-; CHECK-NEXT: v_or_b32_e32 v8, v4, v6
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5
+; CHECK-NEXT: v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -1127,126 +1178,126 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
-; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
+; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v2, v2
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v10, v1, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v9, v2, v0
; CHECK-NEXT: v_mul_lo_u32 v11, v7, v0
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v9
-; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v2, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v2, v0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v10
; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8
-; CHECK-NEXT: v_mul_hi_u32 v13, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v14, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v8
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v1, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v1, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v12
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; CHECK-NEXT: v_mul_hi_u32 v8, v2, v0
+; CHECK-NEXT: v_mul_lo_u32 v9, v2, v1
; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v11, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v0, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v0, v1
-; CHECK-NEXT: v_mul_lo_u32 v12, v2, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v1, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v7
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; CHECK-NEXT: v_mul_hi_u32 v2, v3, v0
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v10, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_mul_hi_u32 v7, v1, v7
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mul_hi_u32 v2, v3, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v3, v1
+; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v2, v5, v1
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, 1, v0
-; CHECK-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, 1, v10
-; CHECK-NEXT: v_addc_u32_e32 v12, vcc, 0, v11, vcc
+; CHECK-NEXT: v_mul_hi_u32 v2, v5, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
+; CHECK-NEXT: v_mul_lo_u32 v9, v6, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v8
; CHECK-NEXT: v_subb_u32_e64 v7, s[4:5], v4, v2, vcc
; CHECK-NEXT: v_sub_i32_e64 v2, s[4:5], v4, v2
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CHECK-NEXT: v_subb_u32_e32 v2, vcc, v2, v6, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v5
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, 1, v0
+; CHECK-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v1, vcc
+; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v5, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v9, v3, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v7
+; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v2, v10, v8, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v11, v12, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e32 v3, v8, v5, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1257,14 +1308,14 @@ define i64 @v_udiv_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: s_cbranch_execz .LBB7_2
; CHECK-NEXT: .LBB7_4:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mul_lo_u32 v2, v0, v5
; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v3, v2
@@ -1289,242 +1340,242 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000
; GISEL-NEXT: v_mov_b32_e32 v10, 0
; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
-; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT: v_trunc_f32_e32 v9, v9
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v6
-; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT: v_mul_lo_u32 v15, v11, v6
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v7
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v13, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v16, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v15
+; GISEL-NEXT: v_mul_lo_u32 v16, v4, v13
+; GISEL-NEXT: v_mul_lo_u32 v17, v5, v15
+; GISEL-NEXT: v_mul_hi_u32 v15, v5, v15
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v17, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v16, v5, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_mul_hi_u32 v13, v5, v13
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v5
+; GISEL-NEXT: v_mul_lo_u32 v12, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_lo_u32 v14, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v5, v12
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT: v_mul_hi_u32 v10, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v11
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v5, v12, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v0, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v0, v12
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v1, v11
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v0, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v11, v1, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v9
; GISEL-NEXT: v_mul_lo_u32 v13, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], v1, v11, vcc
-; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v11
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v4
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT: v_subb_u32_e64 v11, s[4:5], v1, v10, vcc
+; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v10
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v10, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, v12, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v7
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v1, vcc, 0, v1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8
; GISEL-NEXT: v_add_i32_e32 v11, vcc, 1, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v12, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v8
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v7
+; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v15
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v8
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v14
; GISEL-NEXT: v_cndmask_b32_e32 v0, v13, v0, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v12, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v11
+; GISEL-NEXT: v_addc_u32_e32 v8, vcc, 0, v12, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v11, v7, vcc
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1
; GISEL-NEXT: v_trunc_f32_e32 v7, v7
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v6
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_sub_i32_e64 v11, s[4:5], 0, v4
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v5, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v14, v11, v1
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v17, v13, v1
+; GISEL-NEXT: v_mul_lo_u32 v16, v11, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v12, v8, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; GISEL-NEXT: v_mul_hi_u32 v15, v1, v16
+; GISEL-NEXT: v_mul_lo_u32 v17, v1, v14
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v15, v7, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v1, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT: v_mul_lo_u32 v17, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_mul_hi_u32 v14, v7, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v15, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v14, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v11, v1
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v7
+; GISEL-NEXT: v_mul_lo_u32 v13, v13, v1
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v1
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v1, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v12
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v7, v11
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v1, v12
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v7, v12
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v11, s[4:5], v11, v13
+; GISEL-NEXT: v_mul_hi_u32 v12, v7, v12
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v10, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v11, v10
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], v12, v10
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v1, v6
+; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v10, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v10, v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v2, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v4, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v4, v7
+; GISEL-NEXT: v_mul_hi_u32 v8, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v4, v7
; GISEL-NEXT: v_mul_lo_u32 v11, v5, v6
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v9, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v10, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
+; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v3, v8, vcc
+; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v8
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v9, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], v2, v4
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v3, vcc, 0, v3, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5
; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v7, vcc
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v5
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
@@ -1545,143 +1596,143 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_udiv_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_mov_b32_e32 v8, v0
-; CGP-NEXT: v_mov_b32_e32 v9, v1
-; CGP-NEXT: v_mov_b32_e32 v5, v2
-; CGP-NEXT: v_mov_b32_e32 v7, v3
; CGP-NEXT: v_mov_b32_e32 v10, 0x1000
; CGP-NEXT: v_mov_b32_e32 v11, 0
-; CGP-NEXT: v_mov_b32_e32 v0, 0
+; CGP-NEXT: v_mov_b32_e32 v5, v2
+; CGP-NEXT: v_mov_b32_e32 v7, v3
; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4
+; CGP-NEXT: v_mov_b32_e32 v9, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_mov_b32_e32 v8, v0
; CGP-NEXT: v_or_b32_e32 v1, v9, v3
+; CGP-NEXT: v_mov_b32_e32 v0, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB8_2
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v0
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CGP-NEXT: v_trunc_f32_e32 v1, v1
+; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v15, v1, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v1
+; CGP-NEXT: v_mul_hi_u32 v14, v4, v0
; CGP-NEXT: v_mul_lo_u32 v16, v12, v0
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v0, v14
-; CGP-NEXT: v_mul_lo_u32 v17, v4, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v4, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v4, v0
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_mul_hi_u32 v14, v0, v15
; CGP-NEXT: v_mul_lo_u32 v16, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v18, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v19, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v4, v13
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT: v_mul_lo_u32 v17, v1, v15
+; CGP-NEXT: v_mul_hi_u32 v15, v1, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v17, v0, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_mul_lo_u32 v16, v1, v13
; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v17
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT: v_mul_hi_u32 v13, v1, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v13, vcc
+; CGP-NEXT: v_mul_hi_u32 v13, v4, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v4, v1
; CGP-NEXT: v_mul_lo_u32 v12, v12, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v4, v13
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT: v_mul_lo_u32 v12, v0, v1
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v1
-; CGP-NEXT: v_mul_lo_u32 v17, v4, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v4, v1
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v0
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v14, v0, v12
+; CGP-NEXT: v_mul_lo_u32 v15, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v15, v0, v12
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v9, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v1
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v1
-; CGP-NEXT: v_mul_lo_u32 v15, v9, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v9, v1
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v1, v12
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v8, v1
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v1
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v12, v9, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v1, v9, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v12, v2, v0
-; CGP-NEXT: v_mul_hi_u32 v13, v2, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v3, v0
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v2, v1
-; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v0
-; CGP-NEXT: v_addc_u32_e32 v16, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v13, vcc, 1, v15
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, 0, v16, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v2, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v2, v1
+; CGP-NEXT: v_mul_lo_u32 v14, v3, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v2, v0
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
-; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v12
+; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v13
; CGP-NEXT: v_subb_u32_e64 v12, s[4:5], v9, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v9, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
-; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v3, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v3
-; CGP-NEXT: v_cndmask_b32_e32 v9, v14, v9, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v8, v2
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v3
+; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v0
+; CGP-NEXT: v_cndmask_b32_e64 v9, v9, v13, s[4:5]
+; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v8, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v3
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v14, v2, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v12
+; CGP-NEXT: v_addc_u32_e32 v4, vcc, 0, v13, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v2, v15, v13, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v2, v12, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v3, v13, v4, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -1695,14 +1746,14 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_cbranch_execz .LBB8_4
; CGP-NEXT: ; %bb.3:
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mul_lo_u32 v3, v0, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v0
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v3
@@ -1715,10 +1766,10 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
; CGP-NEXT: .LBB8_4: ; %.split
; CGP-NEXT: s_or_b64 exec, exec, s[6:7]
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9
; CGP-NEXT: v_or_b32_e32 v3, v7, v10
; CGP-NEXT: v_mov_b32_e32 v2, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -1731,126 +1782,126 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_setpc_b64 s[30:31]
; CGP-NEXT: .LBB8_7:
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT: v_trunc_f32_e32 v3, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v8, v4, v3
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v2
; CGP-NEXT: v_mul_lo_u32 v13, v6, v2
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v11
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v4, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v2
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v12
; CGP-NEXT: v_mul_lo_u32 v13, v2, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v2, v8
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v4, v8
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v3, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v2, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v13, v3, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT: v_mul_hi_u32 v8, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v3
; CGP-NEXT: v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v8
-; CGP-NEXT: v_mul_lo_u32 v13, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v4, v8
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_lo_u32 v6, v2, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v2, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v4, v3
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v6
+; CGP-NEXT: v_mul_lo_u32 v12, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v6
; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v7, v2
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v7, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v3, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v5, v2
+; CGP-NEXT: v_mul_lo_u32 v6, v5, v3
+; CGP-NEXT: v_mul_lo_u32 v8, v7, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v7, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_mul_hi_u32 v8, v5, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT: v_mul_lo_u32 v6, v9, v2
-; CGP-NEXT: v_mul_hi_u32 v8, v9, v2
-; CGP-NEXT: v_mul_lo_u32 v11, v10, v2
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v9, v3
-; CGP-NEXT: v_add_i32_e32 v12, vcc, 1, v2
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v12
-; CGP-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v9, v2
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v10, v2
+; CGP-NEXT: v_mul_lo_u32 v8, v9, v2
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v8
; CGP-NEXT: v_subb_u32_e64 v6, s[4:5], v7, v4, vcc
; CGP-NEXT: v_sub_i32_e64 v4, s[4:5], v7, v4
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v10
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
; CGP-NEXT: v_subb_u32_e32 v4, vcc, v4, v10, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v6, v10
-; CGP-NEXT: v_cndmask_b32_e32 v6, v11, v7, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v9
; CGP-NEXT: v_sub_i32_e32 v5, vcc, v5, v9
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v10
; CGP-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v6, v7, v8, s[4:5]
+; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v2
+; CGP-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v5, v9
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, -1, vcc
; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v11, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, 1, v7
+; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; CGP-NEXT: v_cndmask_b32_e32 v4, v12, v8, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v14, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v9, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
@@ -1861,14 +1912,14 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_cbranch_execz .LBB8_6
; CGP-NEXT: .LBB8_8:
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: v_mul_lo_u32 v4, v2, v9
; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v2
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v5, v4
@@ -1890,10 +1941,10 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
; GISEL-LABEL: v_udiv_i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -1917,10 +1968,10 @@ define i64 @v_udiv_i64_24bit(i64 %num, i64 %den) {
; CGP-LABEL: v_udiv_i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -1950,282 +2001,282 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_udiv_v2i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
-; GISEL-NEXT: v_trunc_f32_e32 v8, v8
-; GISEL-NEXT: v_trunc_f32_e32 v11, v11
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v4, v12
-; GISEL-NEXT: v_mul_lo_u32 v16, v5, v12
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v19, v10, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v15, v12, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v6, v16
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v17
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17
-; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v16
-; GISEL-NEXT: v_mul_hi_u32 v18, v7, v13
-; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v6, v18
+; GISEL-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v4
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v1, 0
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v3
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v3
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v3, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9
+; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v3, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v3, v10
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v3, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v10
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v0, v5
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v10, 0, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, v7, vcc
+; GISEL-NEXT: v_sub_i32_e64 v7, s[4:5], 0, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_subbrev_u32_e32 v7, vcc, 0, v7, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, -1, v9, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 1, v6
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v4
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v9
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, 0, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v9, v7, vcc
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v1
+; GISEL-NEXT: v_trunc_f32_e32 v7, v7
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v7
+; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v11, vcc
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v9, v1
+; GISEL-NEXT: v_mul_lo_u32 v12, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_mul_hi_u32 v12, v1, v13
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v11
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v7, v13
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6
-; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2
-; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16
-; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12
-; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v2, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v18, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v2, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_mul_hi_u32 v15, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v19, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v1, v11
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v12, v6
+; GISEL-NEXT: v_mul_lo_u32 v12, v7, v11
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v1, v6
+; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v7, v8, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v1, v9, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v1, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v6, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v5, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GISEL-NEXT: v_mul_hi_u32 v4, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v0, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v2
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2
-; GISEL-NEXT: v_mul_hi_u32 v13, 0, v2
-; GISEL-NEXT: v_mul_lo_u32 v2, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v5
-; GISEL-NEXT: v_mul_lo_u32 v15, 0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_mul_lo_u32 v10, v3, v13
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; GISEL-NEXT: v_mul_lo_u32 v9, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v3, v4
-; GISEL-NEXT: v_mul_lo_u32 v11, 0, v4
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, 1, v4
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v1, v2
-; GISEL-NEXT: v_mul_hi_u32 v16, v1, v2
-; GISEL-NEXT: v_mul_lo_u32 v17, 0, v2
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, 1, v2
-; GISEL-NEXT: v_addc_u32_e32 v19, vcc, 0, v5, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v12
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v17
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v8, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[4:5], v0, v15
-; GISEL-NEXT: v_subb_u32_e64 v15, s[6:7], 0, v9, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v7
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[8:9], 0, v15
-; GISEL-NEXT: v_add_i32_e64 v7, s[10:11], 1, v18
-; GISEL-NEXT: v_addc_u32_e64 v15, s[10:11], 0, v19, s[10:11]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[10:11], 0, v8
-; GISEL-NEXT: v_sub_i32_e64 v9, s[10:11], 0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v11, -1, v11, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v17, -1, v17, s[8:9]
-; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v9, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v6, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v8, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v9, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v1, -1, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, 0, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], 0, v6, vcc
+; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], 0, v6
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v7
+; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v8, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v12, v10, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v18, v7, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v14, v16, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v19, v15, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v13, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v5, v4, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v6, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v6, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_udiv_v2i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; CGP-NEXT: v_rcp_f32_e32 v3, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT: v_mul_hi_u32 v6, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v4, 0, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v5, 0, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v6, v4, v2
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_add_i32_e32 v9, vcc, 1, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v3
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v3, 0, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v3, v1
+; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v8
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
-; CGP-NEXT: v_sub_i32_e64 v6, s[4:5], v0, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v3
-; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v9, s[4:5]
-; CGP-NEXT: v_sub_i32_e64 v7, s[6:7], v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v7, vcc
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CGP-NEXT: v_sub_i32_e64 v7, s[4:5], v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v3, v3, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v2, v5
+; CGP-NEXT: v_mul_lo_u32 v5, 0, v5
+; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v5, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v0, v3, v7, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v6
+; CGP-NEXT: v_add_i32_e32 v2, vcc, 1, v5
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
+; CGP-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, 1, v2
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index d733ad9b01405..cd4605256d2d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -123,78 +123,78 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i32:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i32> %num, %den
ret <2 x i32> %result
@@ -244,20 +244,20 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0xb2a50881
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xffed2705
-; CHECK-NEXT: v_mul_hi_u32 v4, v0, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v0, v4
-; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v2
+; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v0, v3
+; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v1, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v4, 1, v4
; CHECK-NEXT: v_lshrrev_b32_e32 v5, 1, v5
-; CHECK-NEXT: v_lshrrev_b32_e32 v6, 1, v6
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CHECK-NEXT: v_lshrrev_b32_e32 v4, 20, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v5, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705
; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2
-; CHECK-NEXT: v_mul_lo_u32 v4, v4, v3
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT: v_mul_lo_u32 v3, v3, v4
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v4
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
@@ -297,82 +297,82 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i32_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CGP-NEXT: v_lshl_b32_e32 v2, 0x1000, v2
-; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_lshl_b32_e32 v3, 0x1000, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
%r = urem <2 x i32> %x, %shl.y
@@ -383,10 +383,10 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
; GISEL-LABEL: v_urem_i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -407,10 +407,10 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
; CGP-LABEL: v_urem_i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -437,87 +437,87 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
; GISEL-LABEL: v_urem_v2i32_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v3
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_lo_u32 v6, v6, v4
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v4, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; GISEL-NEXT: v_mul_hi_u32 v4, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, v5, v7
; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_mul_hi_u32 v5, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; GISEL-NEXT: v_mul_lo_u32 v5, v5, v3
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i32_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
+; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v1
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
; CGP-NEXT: v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v1, v5
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v3
; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%num.mask = and <2 x i32> %num, <i32 16777215, i32 16777215>
%den.mask = and <2 x i32> %den, <i32 16777215, i32 16777215>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 7af8cd7af1660..47544a4c11afb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,GISEL %s
-; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,CGP %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=1 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,GISEL %s
+; RUN: llc -global-isel -amdgpu-codegenprepare-disable-idiv-expansion=0 -mtriple=amdgcn-amd-amdpal -mcpu=gfx600 -denormal-fp-math-f32=preserve-sign -mattr=+mad-mac-f32-insts < %s | FileCheck -check-prefixes=CHECK,CGP %s
; The same 32-bit expansion is implemented in the legalizer and in AMDGPUCodeGenPrepare.
@@ -8,12 +8,12 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-LABEL: v_urem_i64:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_mov_b32_e32 v5, v1
+; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, v0
; CHECK-NEXT: v_or_b32_e32 v1, v5, v3
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CHECK-NEXT: v_cvt_f32_u32_e32 v6, v2
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -26,123 +26,123 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v3
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
; CHECK-NEXT: v_mac_f32_e32 v6, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
+; CHECK-NEXT: v_sub_i32_e32 v6, vcc, 0, v2
+; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v3, vcc
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: v_mul_f32_e32 v6, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v6, v6
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
-; CHECK-NEXT: v_cvt_u32_f32_e32 v6, v6
+; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v6
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v10, v1, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v6, v1
+; CHECK-NEXT: v_mul_hi_u32 v9, v6, v0
; CHECK-NEXT: v_mul_lo_u32 v11, v7, v0
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v9
-; CHECK-NEXT: v_mul_lo_u32 v12, v6, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v6, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v6, v0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v10
; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8
-; CHECK-NEXT: v_mul_hi_u32 v13, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v14, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v1, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v1, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v12
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_addc_u32_e32 v6, vcc, v6, v8, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; CHECK-NEXT: v_mul_hi_u32 v8, v6, v0
+; CHECK-NEXT: v_mul_lo_u32 v9, v6, v1
; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v11, v6, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v0, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v0, v1
-; CHECK-NEXT: v_mul_lo_u32 v12, v6, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v6, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v6, v6, v0
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v6
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v1, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v1, v6
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v7
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v6, v1, vcc
-; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v4, v1
-; CHECK-NEXT: v_mul_lo_u32 v10, v5, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v7
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_mul_hi_u32 v7, v1, v7
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v4, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
+; CHECK-NEXT: v_mul_lo_u32 v8, v5, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v5, v0
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_mul_hi_u32 v8, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v6
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v1, v5, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT: v_mul_lo_u32 v7, v2, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v0
-; CHECK-NEXT: v_mul_lo_u32 v0, v3, v0
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v6
+; CHECK-NEXT: v_mul_hi_u32 v6, v2, v0
; CHECK-NEXT: v_mul_lo_u32 v1, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v2, v0
+; CHECK-NEXT: v_mul_lo_u32 v0, v3, v0
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v6, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v4, v7
; CHECK-NEXT: v_subb_u32_e64 v4, s[4:5], v5, v0, vcc
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v5, v0
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v3
; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v6, vcc, v1, v2
; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v2
; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v3
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v3, v7, v0, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
@@ -155,14 +155,14 @@ define i64 @v_urem_i64(i64 %num, i64 %den) {
; CHECK-NEXT: s_cbranch_execz .LBB0_2
; CHECK-NEXT: .LBB0_4:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v6
-; CHECK-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v3, v3, v0
-; CHECK-NEXT: v_mul_hi_u32 v3, v0, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mul_lo_u32 v0, v0, v2
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
; CHECK-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
@@ -185,134 +185,134 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_or_b64 s[4:5], s[0:1], s[2:3]
; CHECK-NEXT: s_mov_b32 s4, 0
+; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
; CHECK-NEXT: v_cmp_ne_u64_e64 s[4:5], s[4:5], 0
-; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; CHECK-NEXT: s_mov_b32 s6, 1
-; CHECK-NEXT: v_cvt_f32_u32_e32 v0, s2
+; CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_scc0 .LBB1_2
; CHECK-NEXT: ; %bb.1:
; CHECK-NEXT: v_cvt_f32_u32_e32 v1, s3
; CHECK-NEXT: s_sub_u32 s4, 0, s2
-; CHECK-NEXT: v_mov_b32_e32 v2, s1
-; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v0
; CHECK-NEXT: s_subb_u32 s5, 0, s3
+; CHECK-NEXT: v_madmk_f32 v1, v1, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v1, v1
; CHECK-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v1
-; CHECK-NEXT: v_trunc_f32_e32 v3, v3
-; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v3
-; CHECK-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v1
+; CHECK-NEXT: v_trunc_f32_e32 v2, v2
+; CHECK-NEXT: v_mac_f32_e32 v1, 0xcf800000, v2
; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT: v_readfirstlane_b32 s6, v3
+; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_hi_u32 v3, s4, v1
; CHECK-NEXT: v_readfirstlane_b32 s7, v1
-; CHECK-NEXT: v_mul_hi_u32 v4, s4, v1
-; CHECK-NEXT: s_mul_i32 s8, s4, s6
; CHECK-NEXT: s_mul_i32 s9, s4, s7
-; CHECK-NEXT: v_readfirstlane_b32 s10, v4
-; CHECK-NEXT: s_mul_i32 s11, s5, s7
-; CHECK-NEXT: s_add_i32 s8, s10, s8
+; CHECK-NEXT: v_readfirstlane_b32 s6, v2
; CHECK-NEXT: v_mul_hi_u32 v4, v1, s9
-; CHECK-NEXT: s_mul_i32 s10, s6, s9
-; CHECK-NEXT: v_mul_hi_u32 v5, v3, s9
-; CHECK-NEXT: s_add_i32 s8, s8, s11
-; CHECK-NEXT: v_readfirstlane_b32 s9, v4
-; CHECK-NEXT: v_readfirstlane_b32 s11, v5
+; CHECK-NEXT: s_mul_i32 s8, s4, s6
+; CHECK-NEXT: v_readfirstlane_b32 s12, v3
+; CHECK-NEXT: s_mul_i32 s10, s5, s7
+; CHECK-NEXT: s_add_i32 s8, s12, s8
+; CHECK-NEXT: s_add_i32 s8, s8, s10
+; CHECK-NEXT: v_readfirstlane_b32 s10, v4
; CHECK-NEXT: s_mul_i32 s12, s7, s8
+; CHECK-NEXT: v_mul_hi_u32 v3, v2, s9
; CHECK-NEXT: v_mul_hi_u32 v1, v1, s8
-; CHECK-NEXT: s_mul_i32 s13, s6, s8
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, s8
-; CHECK-NEXT: s_add_u32 s8, s9, s12
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
-; CHECK-NEXT: v_readfirstlane_b32 s12, v3
-; CHECK-NEXT: s_cselect_b32 s14, 1, 0
-; CHECK-NEXT: s_add_u32 s8, s8, s10
-; CHECK-NEXT: s_cselect_b32 s8, 1, 0
-; CHECK-NEXT: s_add_i32 s14, s14, s8
-; CHECK-NEXT: s_add_u32 s8, s11, s9
+; CHECK-NEXT: s_mul_i32 s11, s6, s9
+; CHECK-NEXT: s_add_u32 s10, s10, s12
+; CHECK-NEXT: s_cselect_b32 s12, 1, 0
+; CHECK-NEXT: s_add_u32 s10, s10, s11
; CHECK-NEXT: s_cselect_b32 s9, 1, 0
-; CHECK-NEXT: s_add_u32 s8, s8, s13
-; CHECK-NEXT: s_cselect_b32 s10, 1, 0
-; CHECK-NEXT: s_add_i32 s9, s9, s10
-; CHECK-NEXT: s_add_u32 s8, s8, s14
+; CHECK-NEXT: s_add_i32 s12, s12, s9
+; CHECK-NEXT: v_readfirstlane_b32 s9, v3
+; CHECK-NEXT: v_readfirstlane_b32 s10, v1
+; CHECK-NEXT: s_mul_i32 s11, s6, s8
+; CHECK-NEXT: s_add_u32 s9, s9, s10
; CHECK-NEXT: s_cselect_b32 s10, 1, 0
+; CHECK-NEXT: s_add_u32 s9, s9, s11
+; CHECK-NEXT: s_cselect_b32 s11, 1, 0
+; CHECK-NEXT: v_mul_hi_u32 v1, v2, s8
+; CHECK-NEXT: s_add_i32 s10, s10, s11
+; CHECK-NEXT: s_add_u32 s8, s9, s12
+; CHECK-NEXT: s_cselect_b32 s9, 1, 0
+; CHECK-NEXT: s_add_i32 s10, s10, s9
+; CHECK-NEXT: v_readfirstlane_b32 s9, v1
; CHECK-NEXT: s_add_i32 s9, s9, s10
-; CHECK-NEXT: s_add_i32 s12, s12, s9
; CHECK-NEXT: s_add_u32 s7, s7, s8
-; CHECK-NEXT: s_addc_u32 s6, s6, s12
-; CHECK-NEXT: s_mul_i32 s8, s4, s7
; CHECK-NEXT: v_mov_b32_e32 v1, s7
; CHECK-NEXT: v_mul_hi_u32 v1, s4, v1
-; CHECK-NEXT: s_mul_i32 s5, s5, s7
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
+; CHECK-NEXT: s_mul_i32 s8, s4, s7
+; CHECK-NEXT: s_addc_u32 s6, s6, s9
; CHECK-NEXT: s_mul_i32 s4, s4, s6
+; CHECK-NEXT: v_readfirstlane_b32 s9, v1
; CHECK-NEXT: v_mov_b32_e32 v1, s8
; CHECK-NEXT: v_mul_hi_u32 v1, s7, v1
-; CHECK-NEXT: s_mul_i32 s10, s6, s8
-; CHECK-NEXT: v_mov_b32_e32 v3, s6
-; CHECK-NEXT: v_mul_hi_u32 v3, v3, s8
+; CHECK-NEXT: s_mul_i32 s5, s5, s7
; CHECK-NEXT: s_add_i32 s4, s9, s4
-; CHECK-NEXT: v_readfirstlane_b32 s8, v1
-; CHECK-NEXT: v_readfirstlane_b32 s9, v3
; CHECK-NEXT: s_add_i32 s4, s4, s5
-; CHECK-NEXT: s_mul_i32 s5, s7, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v3, s7, v1
-; CHECK-NEXT: s_mul_i32 s4, s6, s4
-; CHECK-NEXT: v_mul_hi_u32 v1, s6, v1
-; CHECK-NEXT: s_add_u32 s5, s8, s5
-; CHECK-NEXT: v_readfirstlane_b32 s8, v3
-; CHECK-NEXT: v_readfirstlane_b32 s11, v1
-; CHECK-NEXT: s_cselect_b32 s12, 1, 0
+; CHECK-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, s6
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: s_mul_i32 s9, s7, s4
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, s8
+; CHECK-NEXT: v_mul_hi_u32 v3, s7, v2
+; CHECK-NEXT: s_mul_i32 s10, s6, s8
+; CHECK-NEXT: s_add_u32 s5, s5, s9
+; CHECK-NEXT: s_cselect_b32 s9, 1, 0
; CHECK-NEXT: s_add_u32 s5, s5, s10
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
-; CHECK-NEXT: s_add_i32 s12, s12, s5
-; CHECK-NEXT: s_add_u32 s5, s9, s8
+; CHECK-NEXT: s_add_i32 s9, s9, s5
+; CHECK-NEXT: v_readfirstlane_b32 s5, v1
+; CHECK-NEXT: v_readfirstlane_b32 s8, v3
+; CHECK-NEXT: s_mul_i32 s4, s6, s4
+; CHECK-NEXT: s_add_u32 s5, s5, s8
; CHECK-NEXT: s_cselect_b32 s8, 1, 0
; CHECK-NEXT: s_add_u32 s4, s5, s4
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
+; CHECK-NEXT: v_mul_hi_u32 v1, s6, v2
; CHECK-NEXT: s_add_i32 s5, s8, s5
-; CHECK-NEXT: s_add_u32 s4, s4, s12
+; CHECK-NEXT: s_add_u32 s4, s4, s9
; CHECK-NEXT: s_cselect_b32 s8, 1, 0
; CHECK-NEXT: s_add_i32 s5, s5, s8
-; CHECK-NEXT: s_add_i32 s11, s11, s5
+; CHECK-NEXT: v_readfirstlane_b32 s8, v1
+; CHECK-NEXT: s_add_i32 s8, s8, s5
; CHECK-NEXT: s_add_u32 s4, s7, s4
-; CHECK-NEXT: s_addc_u32 s5, s6, s11
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mul_hi_u32 v1, s0, v1
-; CHECK-NEXT: s_mul_i32 s6, s1, s4
-; CHECK-NEXT: v_mul_hi_u32 v2, v2, s4
-; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: s_addc_u32 s5, s6, s8
+; CHECK-NEXT: v_mov_b32_e32 v2, s5
; CHECK-NEXT: s_mul_i32 s7, s0, s5
-; CHECK-NEXT: v_readfirstlane_b32 s8, v2
-; CHECK-NEXT: v_mov_b32_e32 v1, s5
-; CHECK-NEXT: v_mul_hi_u32 v2, s0, v1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v1
+; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_mul_hi_u32 v1, v1, s4
+; CHECK-NEXT: v_mul_hi_u32 v3, s0, v2
+; CHECK-NEXT: s_mul_i32 s8, s1, s4
+; CHECK-NEXT: s_add_u32 s6, s6, s7
+; CHECK-NEXT: s_cselect_b32 s7, 1, 0
+; CHECK-NEXT: s_add_u32 s6, s6, s8
+; CHECK-NEXT: s_cselect_b32 s4, 1, 0
+; CHECK-NEXT: s_add_i32 s7, s7, s4
+; CHECK-NEXT: v_readfirstlane_b32 s4, v1
+; CHECK-NEXT: v_readfirstlane_b32 s6, v3
; CHECK-NEXT: s_mul_i32 s5, s1, s5
-; CHECK-NEXT: v_mul_hi_u32 v1, s1, v1
-; CHECK-NEXT: s_add_u32 s4, s4, s7
-; CHECK-NEXT: v_readfirstlane_b32 s7, v2
-; CHECK-NEXT: v_readfirstlane_b32 s9, v1
-; CHECK-NEXT: s_cselect_b32 s10, 1, 0
; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_cselect_b32 s4, 1, 0
-; CHECK-NEXT: s_add_i32 s10, s10, s4
-; CHECK-NEXT: s_add_u32 s4, s8, s7
; CHECK-NEXT: s_cselect_b32 s6, 1, 0
; CHECK-NEXT: s_add_u32 s4, s4, s5
; CHECK-NEXT: s_cselect_b32 s5, 1, 0
; CHECK-NEXT: s_add_i32 s5, s6, s5
-; CHECK-NEXT: s_add_u32 s4, s4, s10
+; CHECK-NEXT: s_add_u32 s4, s4, s7
+; CHECK-NEXT: v_mul_hi_u32 v1, s1, v2
+; CHECK-NEXT: v_mov_b32_e32 v2, s4
+; CHECK-NEXT: v_mul_hi_u32 v2, s2, v2
; CHECK-NEXT: s_cselect_b32 s6, 1, 0
-; CHECK-NEXT: s_mul_i32 s7, s2, s4
-; CHECK-NEXT: v_mov_b32_e32 v1, s4
-; CHECK-NEXT: v_mul_hi_u32 v1, s2, v1
-; CHECK-NEXT: s_mul_i32 s4, s3, s4
; CHECK-NEXT: s_add_i32 s5, s5, s6
; CHECK-NEXT: v_readfirstlane_b32 s6, v1
-; CHECK-NEXT: s_add_i32 s9, s9, s5
-; CHECK-NEXT: s_mul_i32 s5, s2, s9
-; CHECK-NEXT: s_add_i32 s5, s6, s5
-; CHECK-NEXT: s_add_i32 s6, s5, s4
-; CHECK-NEXT: s_sub_u32 s4, s0, s7
+; CHECK-NEXT: s_add_i32 s6, s6, s5
+; CHECK-NEXT: v_readfirstlane_b32 s7, v2
+; CHECK-NEXT: s_mul_i32 s6, s2, s6
+; CHECK-NEXT: s_mul_i32 s5, s2, s4
+; CHECK-NEXT: s_mul_i32 s4, s3, s4
+; CHECK-NEXT: s_add_i32 s6, s7, s6
+; CHECK-NEXT: s_add_i32 s6, s6, s4
+; CHECK-NEXT: s_sub_u32 s4, s0, s5
; CHECK-NEXT: s_cselect_b32 s7, 1, 0
; CHECK-NEXT: s_subb_u32 s5, s1, s6
; CHECK-NEXT: s_sub_i32 s1, s1, s6
@@ -364,21 +364,21 @@ define amdgpu_ps i64 @s_urem_i64(i64 inreg %num, i64 inreg %den) {
; CHECK-NEXT: s_add_i32 s1, s3, s1
; CHECK-NEXT: v_mov_b32_e32 v0, s1
; CHECK-NEXT: v_mul_hi_u32 v0, s0, v0
-; CHECK-NEXT: v_readfirstlane_b32 s1, v0
-; CHECK-NEXT: s_mul_i32 s1, s1, s2
-; CHECK-NEXT: s_sub_i32 s0, s0, s1
-; CHECK-NEXT: s_cmp_ge_u32 s0, s2
-; CHECK-NEXT: s_cselect_b32 s1, 1, 0
-; CHECK-NEXT: s_sub_i32 s3, s0, s2
-; CHECK-NEXT: s_cmp_lg_u32 s1, 0
-; CHECK-NEXT: s_cselect_b32 s0, s3, s0
+; CHECK-NEXT: s_mov_b32 s1, 0
+; CHECK-NEXT: s_mov_b32 s5, s1
+; CHECK-NEXT: v_readfirstlane_b32 s3, v0
+; CHECK-NEXT: s_mul_i32 s3, s3, s2
+; CHECK-NEXT: s_sub_i32 s0, s0, s3
; CHECK-NEXT: s_cmp_ge_u32 s0, s2
-; CHECK-NEXT: s_cselect_b32 s4, 1, 0
-; CHECK-NEXT: s_mov_b32 s3, 0
-; CHECK-NEXT: s_sub_i32 s2, s0, s2
-; CHECK-NEXT: s_mov_b32 s1, s3
-; CHECK-NEXT: s_cmp_lg_u32 s4, 0
-; CHECK-NEXT: s_cselect_b64 s[4:5], s[2:3], s[0:1]
+; CHECK-NEXT: s_cselect_b32 s3, 1, 0
+; CHECK-NEXT: s_sub_i32 s4, s0, s2
+; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+; CHECK-NEXT: s_cselect_b32 s4, s4, s0
+; CHECK-NEXT: s_cmp_ge_u32 s4, s2
+; CHECK-NEXT: s_cselect_b32 s3, 1, 0
+; CHECK-NEXT: s_sub_i32 s0, s4, s2
+; CHECK-NEXT: s_cmp_lg_u32 s3, 0
+; CHECK-NEXT: s_cselect_b64 s[4:5], s[0:1], s[4:5]
; CHECK-NEXT: .LBB1_5: ; %.split
; CHECK-NEXT: s_mov_b32 s0, s4
; CHECK-NEXT: s_mov_b32 s1, s4
@@ -401,6 +401,8 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v4
; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v5
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v9
; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v8
; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
@@ -409,94 +411,95 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v9
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
+; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
; GISEL-NEXT: v_mul_lo_u32 v15, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v14
+; GISEL-NEXT: v_mul_lo_u32 v15, v8, v12
+; GISEL-NEXT: v_mul_lo_u32 v16, v9, v14
+; GISEL-NEXT: v_mul_hi_u32 v14, v9, v14
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT: v_mul_lo_u32 v15, v9, v12
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v8, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v8
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v12, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v9
; GISEL-NEXT: v_mul_lo_u32 v11, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v12
-; GISEL-NEXT: v_mul_lo_u32 v13, v8, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v10, v10, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v7
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v8, v11
; GISEL-NEXT: v_mul_lo_u32 v14, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v14, v8, v11
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v9, v11
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
+; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v11, vcc
; GISEL-NEXT: v_mul_hi_u32 v10, v0, v8
; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
; GISEL-NEXT: v_mul_lo_u32 v12, v1, v8
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
+; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v6
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v12, v0, v9
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v1, v8
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v11, v1, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v4, v8
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v4, v8
; GISEL-NEXT: v_mul_lo_u32 v9, v4, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
; GISEL-NEXT: v_mul_lo_u32 v8, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v15
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v8, vcc
; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v8
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
@@ -504,123 +507,120 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v4
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v4
+; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v14
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v5, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
+; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v10, v10
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v10
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v6
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v7, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v5
+; GISEL-NEXT: v_mul_lo_u32 v15, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v17, v13, v5
+; GISEL-NEXT: v_mul_lo_u32 v16, v12, v5
; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; GISEL-NEXT: v_mul_hi_u32 v15, v5, v16
+; GISEL-NEXT: v_mul_lo_u32 v17, v5, v14
+; GISEL-NEXT: v_mul_lo_u32 v11, v10, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v5, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
+; GISEL-NEXT: v_mul_lo_u32 v17, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v12, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v13, v5
+; GISEL-NEXT: v_mul_lo_u32 v12, v12, v5
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v5, v12
+; GISEL-NEXT: v_mul_lo_u32 v14, v5, v11
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v10, v12
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v8, v10, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v5, v11
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v13, v4
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v11
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v12, v8
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v11, v8
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v5, v4
+; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v10, v8, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v10, v2, v5
; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v7
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v5, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v6
-; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v7, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v4, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v5, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v4, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v5, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v4, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v8, vcc
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v3, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
; GISEL-NEXT: v_mul_hi_u32 v9, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GISEL-NEXT: v_mul_lo_u32 v10, v3, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, v6, v4
-; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT: v_mul_hi_u32 v8, v6, v4
; GISEL-NEXT: v_mul_lo_u32 v5, v6, v5
+; GISEL-NEXT: v_mul_lo_u32 v9, v6, v4
; GISEL-NEXT: v_mul_lo_u32 v4, v7, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v3, v4, vcc
; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v4
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v7
@@ -628,18 +628,18 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v7
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v6
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v6
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v7, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
@@ -652,137 +652,137 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-LABEL: v_urem_v2i64:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_mov_b32_e32 v10, v0
; CGP-NEXT: v_mov_b32_e32 v11, v1
; CGP-NEXT: v_mov_b32_e32 v8, v2
-; CGP-NEXT: v_mov_b32_e32 v9, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4
+; CGP-NEXT: v_mov_b32_e32 v10, v0
; CGP-NEXT: v_or_b32_e32 v1, v11, v5
; CGP-NEXT: v_mov_b32_e32 v0, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT: v_cvt_f32_u32_e32 v2, v4
+; CGP-NEXT: v_mov_b32_e32 v9, v3
; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB2_2
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v5
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; CGP-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
+; CGP-NEXT: v_subb_u32_e32 v3, vcc, 0, v5, vcc
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CGP-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
-; CGP-NEXT: v_trunc_f32_e32 v2, v2
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CGP-NEXT: v_trunc_f32_e32 v1, v1
+; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v2
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v2, v1
+; CGP-NEXT: v_mul_hi_u32 v13, v2, v0
; CGP-NEXT: v_mul_lo_u32 v15, v3, v0
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v16, v2, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v2, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v2, v0
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v0, v14
; CGP-NEXT: v_mul_lo_u32 v15, v0, v12
-; CGP-NEXT: v_mul_hi_u32 v17, v0, v12
-; CGP-NEXT: v_mul_lo_u32 v18, v2, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v12
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_mul_lo_u32 v16, v1, v14
+; CGP-NEXT: v_mul_hi_u32 v14, v1, v14
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v16, v0, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT: v_mul_lo_u32 v15, v1, v12
; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v18
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v17, v16
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; CGP-NEXT: v_mul_hi_u32 v12, v1, v12
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v2, v12, vcc
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v13, v1, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v2, v1
; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v2
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v12
-; CGP-NEXT: v_mul_lo_u32 v15, v2, v12
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v12
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v3
-; CGP-NEXT: v_mul_lo_u32 v3, v0, v1
-; CGP-NEXT: v_mul_hi_u32 v13, v0, v1
-; CGP-NEXT: v_mul_lo_u32 v16, v2, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v2, v1
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v2, v2, v0
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v15
-; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v14, v3
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v12, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v13, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v1, v2
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v0, v3
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; CGP-NEXT: v_mul_hi_u32 v2, v10, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v11, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v13, v10, v1
-; CGP-NEXT: v_mul_lo_u32 v14, v11, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT: v_mul_lo_u32 v13, v1, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v12
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
+; CGP-NEXT: v_mul_hi_u32 v2, v10, v0
+; CGP-NEXT: v_mul_lo_u32 v3, v10, v1
+; CGP-NEXT: v_mul_lo_u32 v12, v11, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v11, v0
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v10, v1
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v3, v11, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v13, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v12, v3
+; CGP-NEXT: v_mul_hi_u32 v1, v11, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v12, v4, v0
-; CGP-NEXT: v_mul_lo_u32 v0, v5, v0
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v4, v0
; CGP-NEXT: v_mul_lo_u32 v1, v4, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v12, v1
+; CGP-NEXT: v_mul_lo_u32 v3, v4, v0
+; CGP-NEXT: v_mul_lo_u32 v0, v5, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v10, v3
; CGP-NEXT: v_subb_u32_e64 v2, s[4:5], v11, v0, vcc
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v11, v0
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
+; CGP-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v4
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v5
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5
-; CGP-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v10, vcc, v1, v4
; CGP-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v5
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v4
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v11, v5
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v5
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v11, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v13, v12, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
; CGP-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc
; CGP-NEXT: v_cndmask_b32_e32 v5, v11, v0, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
@@ -796,14 +796,14 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_cbranch_execz .LBB2_4
; CGP-NEXT: ; %bb.3:
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, 0, v4
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v4
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v2, v2, v0
-; CGP-NEXT: v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v10, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mul_lo_u32 v0, v0, v4
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v0, v4
@@ -814,10 +814,10 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
; CGP-NEXT: .LBB2_4: ; %.split
; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6
; CGP-NEXT: v_or_b32_e32 v3, v9, v7
; CGP-NEXT: v_mov_b32_e32 v2, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v6
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -830,123 +830,123 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_setpc_b64 s[30:31]
; CGP-NEXT: .LBB2_7:
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v7
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
-; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
+; CGP-NEXT: v_subb_u32_e32 v5, vcc, 0, v7, vcc
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT: v_trunc_f32_e32 v3, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v10, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v10, v4, v3
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v2
; CGP-NEXT: v_mul_lo_u32 v13, v5, v2
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v11
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v4, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v2
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v12
; CGP-NEXT: v_mul_lo_u32 v13, v2, v10
-; CGP-NEXT: v_mul_hi_u32 v15, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v4, v10
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v3, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v2, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v13, v3, v10
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v10, v3, v10
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v10, vcc
-; CGP-NEXT: v_mul_lo_u32 v10, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_mul_hi_u32 v10, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v3
; CGP-NEXT: v_mul_lo_u32 v5, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v13, v4, v10
-; CGP-NEXT: v_mul_hi_u32 v10, v4, v10
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT: v_mul_lo_u32 v5, v2, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v2, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v4, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v12, v5
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v10, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v5
+; CGP-NEXT: v_mul_lo_u32 v12, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v10, v12
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v5
; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v9, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v9, v2
-; CGP-NEXT: v_mul_lo_u32 v10, v8, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v9, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v9, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v3, v5
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v5, v3, v5
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v10
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v2
+; CGP-NEXT: v_mul_lo_u32 v5, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v10, v9, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v9, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v10
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_mul_hi_u32 v10, v8, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_lo_u32 v5, v9, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v11, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; CGP-NEXT: v_mul_hi_u32 v3, v9, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v5, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v10, v6, v2
-; CGP-NEXT: v_mul_lo_u32 v2, v7, v2
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v6, v2
; CGP-NEXT: v_mul_lo_u32 v3, v6, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v10, v3
+; CGP-NEXT: v_mul_lo_u32 v5, v6, v2
+; CGP-NEXT: v_mul_lo_u32 v2, v7, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v2, vcc
; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v9, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v7
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v6
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v7
; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v7
-; CGP-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v8, vcc, v3, v6
; CGP-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v2, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v7
; CGP-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v6
; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v7, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v9, v7
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v7
; CGP-NEXT: v_sub_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v9, v7
-; CGP-NEXT: v_cndmask_b32_e32 v7, v11, v10, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; CGP-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
; CGP-NEXT: v_cndmask_b32_e32 v7, v9, v2, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
@@ -959,14 +959,14 @@ define <2 x i64> @v_urem_v2i64(<2 x i64> %num, <2 x i64> %den) {
; CGP-NEXT: s_cbranch_execz .LBB2_6
; CGP-NEXT: .LBB2_8:
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v6
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v6
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v8, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: v_mul_lo_u32 v2, v2, v6
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v6
@@ -1010,36 +1010,36 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v2, 0x1fb03c31
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xd9528440
-; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705
-; CHECK-NEXT: v_mul_hi_u32 v5, v0, v2
-; CHECK-NEXT: v_mul_lo_u32 v6, v0, v3
-; CHECK-NEXT: v_mul_lo_u32 v7, v1, v2
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xd9528440
+; CHECK-NEXT: v_mul_hi_u32 v3, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v5, v0, v4
+; CHECK-NEXT: v_mul_lo_u32 v6, v1, v2
; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v0, v3
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v1, v3
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v6, vcc, v8, v7
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v6, v0, v4
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_mul_lo_u32 v5, v1, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v5
; CHECK-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT: v_mul_hi_u32 v4, v1, v4
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CHECK-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CHECK-NEXT: v_lshr_b64 v[2:3], v[2:3], 20
-; CHECK-NEXT: v_mul_lo_u32 v5, v2, v4
-; CHECK-NEXT: v_mul_hi_u32 v6, v2, v4
+; CHECK-NEXT: v_mov_b32_e32 v4, 0xffed2705
+; CHECK-NEXT: v_mul_hi_u32 v5, v2, v4
; CHECK-NEXT: v_mul_lo_u32 v3, v3, v4
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v6, v2
+; CHECK-NEXT: v_mul_lo_u32 v6, v2, v4
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v2, vcc
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = urem i64 %num, 1235195
@@ -1050,131 +1050,131 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
; GISEL-LABEL: v_urem_v2i64_oddk_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
-; GISEL-NEXT: v_mov_b32_e32 v5, 0xd9528440
-; GISEL-NEXT: v_mov_b32_e32 v8, 0x12d8fb
-; GISEL-NEXT: v_mul_hi_u32 v6, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v7, v0, v5
-; GISEL-NEXT: v_mul_lo_u32 v9, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v10, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v5
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v5
-; GISEL-NEXT: v_mul_hi_u32 v13, v1, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v16, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
-; GISEL-NEXT: v_mul_lo_u32 v18, v3, v5
-; GISEL-NEXT: v_mul_hi_u32 v19, v3, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v14, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_mov_b32_e32 v6, 0x1fb03c31
+; GISEL-NEXT: v_mov_b32_e32 v7, 0xd9528440
+; GISEL-NEXT: v_mul_hi_u32 v4, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v5, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v8, v1, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v0, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v4, v18
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v6
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v1, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v19, v7
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_hi_u32 v8, v1, v7
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v3, v6
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_lshr_b64 v[4:5], v[4:5], 20
; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], 20
+; GISEL-NEXT: v_mov_b32_e32 v8, 0x12d8fb
; GISEL-NEXT: v_mul_lo_u32 v9, v4, v8
; GISEL-NEXT: v_mul_hi_u32 v4, v4, v8
; GISEL-NEXT: v_mul_lo_u32 v5, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v10, v6, v8
; GISEL-NEXT: v_mul_lo_u32 v7, v7, v8
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v5, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v7
; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v9
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v6, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64_oddk_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_mov_b32_e32 v4, 0x1fb03c31
-; CGP-NEXT: v_mov_b32_e32 v5, 0xd9528440
+; CGP-NEXT: v_mov_b32_e32 v6, 0x1fb03c31
+; CGP-NEXT: v_mov_b32_e32 v7, 0xd9528440
+; CGP-NEXT: v_mul_hi_u32 v4, v0, v6
+; CGP-NEXT: v_mul_lo_u32 v5, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v8, v1, v6
+; CGP-NEXT: v_mul_hi_u32 v9, v0, v7
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v8, v1, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_mul_lo_u32 v5, v1, v7
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT: v_mul_hi_u32 v9, v1, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; CGP-NEXT: v_lshr_b64 v[4:5], v[4:5], 20
; CGP-NEXT: v_mov_b32_e32 v8, 0xffed2705
-; CGP-NEXT: v_mul_hi_u32 v6, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v0, v5
-; CGP-NEXT: v_mul_lo_u32 v9, v1, v4
-; CGP-NEXT: v_mul_hi_u32 v10, v1, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v0, v5
-; CGP-NEXT: v_mul_lo_u32 v12, v1, v5
-; CGP-NEXT: v_mul_hi_u32 v13, v1, v5
-; CGP-NEXT: v_mul_hi_u32 v14, v2, v4
-; CGP-NEXT: v_mul_lo_u32 v15, v2, v5
-; CGP-NEXT: v_mul_lo_u32 v16, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v17, v2, v5
-; CGP-NEXT: v_mul_lo_u32 v18, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v19, v3, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
+; CGP-NEXT: v_mul_hi_u32 v9, v4, v8
+; CGP-NEXT: v_mul_lo_u32 v5, v5, v8
+; CGP-NEXT: v_mul_lo_u32 v10, v4, v8
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_mul_hi_u32 v9, v2, v6
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v5
+; CGP-NEXT: v_mul_lo_u32 v4, v3, v6
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v11
+; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v5, v3, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v2, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v9, v4
+; CGP-NEXT: v_mul_lo_u32 v9, v3, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v11
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v14, v15
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v17
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v4, v18
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v14, v11
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v5
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT: v_mul_hi_u32 v7, v3, v7
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v5, v4
; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v19, v7
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
; CGP-NEXT: v_lshr_b64 v[4:5], v[4:5], 20
-; CGP-NEXT: v_lshr_b64 v[6:7], v[6:7], 20
-; CGP-NEXT: v_mul_lo_u32 v9, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v10, v4, v8
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT: v_mul_hi_u32 v6, v4, v8
; CGP-NEXT: v_mul_lo_u32 v5, v5, v8
-; CGP-NEXT: v_mul_lo_u32 v11, v6, v8
-; CGP-NEXT: v_mul_hi_u32 v12, v6, v8
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v8
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_sub_i32_e32 v6, vcc, v12, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v4, v8
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, v6, v4
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v4, vcc
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
ret <2 x i64> %result
@@ -1188,11 +1188,11 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: v_mov_b32_e32 v4, v1
; CHECK-NEXT: v_mov_b32_e32 v0, 0x1000
; CHECK-NEXT: v_mov_b32_e32 v1, 0
-; CHECK-NEXT: v_mov_b32_e32 v7, 0
; CHECK-NEXT: v_lshl_b64 v[5:6], v[0:1], v2
-; CHECK-NEXT: v_or_b32_e32 v8, v4, v6
-; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: v_cvt_f32_u32_e32 v2, v5
+; CHECK-NEXT: v_or_b32_e32 v1, v4, v6
+; CHECK-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; CHECK-NEXT: ; implicit-def: $vgpr0_vgpr1
; CHECK-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -1205,123 +1205,123 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: s_setpc_b64 s[30:31]
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: v_cvt_f32_u32_e32 v0, v6
-; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
-; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc
; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v0
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
+; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
+; CHECK-NEXT: v_subb_u32_e32 v7, vcc, 0, v6, vcc
; CHECK-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x2f800000, v0
-; CHECK-NEXT: v_trunc_f32_e32 v2, v2
-; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
+; CHECK-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CHECK-NEXT: v_trunc_f32_e32 v1, v1
+; CHECK-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CHECK-NEXT: v_cvt_u32_f32_e32 v1, v1
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v10, v1, v0
+; CHECK-NEXT: v_mul_lo_u32 v8, v2, v1
+; CHECK-NEXT: v_mul_hi_u32 v9, v2, v0
; CHECK-NEXT: v_mul_lo_u32 v11, v7, v0
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v9
-; CHECK-NEXT: v_mul_lo_u32 v12, v2, v9
-; CHECK-NEXT: v_mul_hi_u32 v9, v2, v9
+; CHECK-NEXT: v_mul_lo_u32 v10, v2, v0
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CHECK-NEXT: v_mul_hi_u32 v9, v0, v10
; CHECK-NEXT: v_mul_lo_u32 v11, v0, v8
-; CHECK-NEXT: v_mul_hi_u32 v13, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v14, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v8
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_mul_lo_u32 v12, v1, v10
+; CHECK-NEXT: v_mul_hi_u32 v10, v1, v10
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; CHECK-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_mul_hi_u32 v12, v0, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT: v_mul_lo_u32 v11, v1, v8
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v14
; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CHECK-NEXT: v_add_i32_e32 v11, vcc, v13, v12
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; CHECK-NEXT: v_add_i32_e32 v10, vcc, v10, v11
+; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CHECK-NEXT: v_mul_hi_u32 v8, v1, v8
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v10, vcc, v11, v10
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; CHECK-NEXT: v_addc_u32_e32 v2, vcc, v2, v8, vcc
-; CHECK-NEXT: v_mul_lo_u32 v8, v1, v0
-; CHECK-NEXT: v_mul_hi_u32 v9, v1, v0
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v8, vcc
+; CHECK-NEXT: v_mul_hi_u32 v8, v2, v0
+; CHECK-NEXT: v_mul_lo_u32 v9, v2, v1
; CHECK-NEXT: v_mul_lo_u32 v7, v7, v0
-; CHECK-NEXT: v_mul_lo_u32 v1, v1, v2
-; CHECK-NEXT: v_mul_hi_u32 v10, v0, v8
-; CHECK-NEXT: v_mul_lo_u32 v11, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v8, v2, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v7
-; CHECK-NEXT: v_mul_lo_u32 v7, v0, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v0, v1
-; CHECK-NEXT: v_mul_lo_u32 v12, v2, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; CHECK-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v10, v7
-; CHECK-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v8, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v9, v0, v7
+; CHECK-NEXT: v_mul_lo_u32 v10, v1, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
-; CHECK-NEXT: v_mul_hi_u32 v2, v3, v0
-; CHECK-NEXT: v_mul_lo_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
-; CHECK-NEXT: v_mul_lo_u32 v8, v3, v1
-; CHECK-NEXT: v_mul_hi_u32 v9, v3, v1
-; CHECK-NEXT: v_mul_lo_u32 v10, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v9
+; CHECK-NEXT: v_mul_hi_u32 v10, v0, v7
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_mul_lo_u32 v9, v1, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v9
; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT: v_mul_hi_u32 v7, v1, v7
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT: v_mul_hi_u32 v2, v3, v0
+; CHECK-NEXT: v_mul_lo_u32 v7, v3, v1
+; CHECK-NEXT: v_mul_lo_u32 v8, v4, v0
+; CHECK-NEXT: v_mul_hi_u32 v0, v4, v0
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v10
+; CHECK-NEXT: v_mul_hi_u32 v8, v3, v1
+; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT: v_mul_lo_u32 v7, v4, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v7
; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT: v_mul_hi_u32 v1, v4, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
; CHECK-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT: v_mul_lo_u32 v0, v6, v0
; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT: v_mul_hi_u32 v2, v5, v0
; CHECK-NEXT: v_mul_lo_u32 v1, v5, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; CHECK-NEXT: v_mul_lo_u32 v7, v5, v0
+; CHECK-NEXT: v_mul_lo_u32 v0, v6, v0
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CHECK-NEXT: v_sub_i32_e32 v1, vcc, v3, v7
; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v4, v0, vcc
; CHECK-NEXT: v_sub_i32_e64 v0, s[4:5], v4, v0
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v6
+; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v5
; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v2, v6
; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v2, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5]
; CHECK-NEXT: v_sub_i32_e32 v4, vcc, v1, v5
; CHECK-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v0, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v6
; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v5
; CHECK-NEXT: v_subb_u32_e32 v0, vcc, v0, v6, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v7, v6
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v6
; CHECK-NEXT: v_sub_i32_e32 v5, vcc, v4, v5
+; CHECK-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CHECK-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, v7, v6
-; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
; CHECK-NEXT: v_cndmask_b32_e32 v5, v7, v0, vcc
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
@@ -1334,14 +1334,14 @@ define i64 @v_urem_i64_pow2_shl_denom(i64 %x, i64 %y) {
; CHECK-NEXT: s_cbranch_execz .LBB7_2
; CHECK-NEXT: .LBB7_4:
; CHECK-NEXT: v_rcp_iflag_f32_e32 v0, v2
-; CHECK-NEXT: v_sub_i32_e32 v2, vcc, 0, v5
-; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: v_sub_i32_e32 v1, vcc, 0, v5
; CHECK-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CHECK-NEXT: v_mul_lo_u32 v2, v2, v0
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_mul_lo_u32 v1, v1, v0
+; CHECK-NEXT: v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CHECK-NEXT: v_mul_hi_u32 v0, v3, v0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
; CHECK-NEXT: v_mul_lo_u32 v0, v0, v5
; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v3, v0
; CHECK-NEXT: v_sub_i32_e32 v2, vcc, v0, v5
@@ -1364,105 +1364,107 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_mov_b32_e32 v9, 0x1000
; GISEL-NEXT: v_mov_b32_e32 v10, 0
; GISEL-NEXT: v_lshl_b64 v[7:8], v[9:10], v4
-; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v7
-; GISEL-NEXT: v_cvt_f32_u32_e32 v9, v8
-; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v9
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v9, 0x2f800000, v6
-; GISEL-NEXT: v_trunc_f32_e32 v9, v9
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v9
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v9
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v7
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v6
-; GISEL-NEXT: v_mul_lo_u32 v14, v10, v9
-; GISEL-NEXT: v_mul_lo_u32 v15, v11, v6
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v7
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v8
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v7
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v8, vcc
+; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v13, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v16, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v15, v11, v4
; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
-; GISEL-NEXT: v_mul_hi_u32 v14, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v15, v6, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v12
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT: v_mul_hi_u32 v14, v4, v15
+; GISEL-NEXT: v_mul_lo_u32 v16, v4, v13
+; GISEL-NEXT: v_mul_lo_u32 v17, v5, v15
+; GISEL-NEXT: v_mul_hi_u32 v15, v5, v15
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v17
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v17, v4, v13
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; GISEL-NEXT: v_mul_lo_u32 v16, v5, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; GISEL-NEXT: v_mul_hi_u32 v13, v5, v13
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v6, v13
-; GISEL-NEXT: v_mul_lo_u32 v16, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v13, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v14, v11, v5
+; GISEL-NEXT: v_mul_lo_u32 v12, v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v11, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v13, v4, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
+; GISEL-NEXT: v_mul_lo_u32 v15, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v15, v4, v12
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; GISEL-NEXT: v_mul_lo_u32 v14, v5, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v13
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v12, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v6
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v9
-; GISEL-NEXT: v_mul_lo_u32 v11, v11, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v12
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v5, v12
; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v13, v11
-; GISEL-NEXT: v_mul_hi_u32 v12, v9, v12
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v14, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, v9, v10, vcc
-; GISEL-NEXT: v_mul_hi_u32 v10, v0, v6
-; GISEL-NEXT: v_mul_lo_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v4, v11
+; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v5, v12, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v0, v11
+; GISEL-NEXT: v_mul_lo_u32 v14, v0, v12
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v1, v11
+; GISEL-NEXT: v_cvt_f32_u32_e32 v15, v5
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v14
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v0, v9
-; GISEL-NEXT: v_mul_lo_u32 v12, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v11
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v1, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v0, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v1, v12
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v9, v1, v9
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v10, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v11, v1, v12
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_mul_hi_u32 v10, v7, v6
; GISEL-NEXT: v_mul_lo_u32 v9, v7, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v6
; GISEL-NEXT: v_mul_lo_u32 v6, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT: v_cvt_f32_u32_e32 v14, v4
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v11
; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v1, v6, vcc
; GISEL-NEXT: v_sub_i32_e64 v1, s[4:5], v1, v6
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v8
@@ -1470,123 +1472,121 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v7
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v8
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], v0, v7
+; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT: v_subbrev_u32_e64 v11, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v7
+; GISEL-NEXT: v_mac_f32_e32 v14, 0x4f800000, v15
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v11, v8
; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v11, vcc, 0, v1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v8
-; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v13, vcc
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v8, v14
; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v10, v7
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v1, v8, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x5f7ffffc, v8
; GISEL-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc
+; GISEL-NEXT: v_mul_f32_e32 v10, 0x2f800000, v8
+; GISEL-NEXT: v_trunc_f32_e32 v10, v10
+; GISEL-NEXT: v_mac_f32_e32 v8, 0xcf800000, v10
+; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v10
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], 0, v4
+; GISEL-NEXT: v_subb_u32_e64 v13, s[4:5], 0, v5, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v14, v12, v8
+; GISEL-NEXT: v_mul_lo_u32 v15, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v17, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v16, v12, v8
; GISEL-NEXT: v_cndmask_b32_e32 v1, v11, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v15
+; GISEL-NEXT: v_add_i32_e64 v14, s[4:5], v14, v17
+; GISEL-NEXT: v_mul_hi_u32 v15, v8, v16
+; GISEL-NEXT: v_mul_lo_u32 v17, v8, v14
+; GISEL-NEXT: v_mul_lo_u32 v11, v10, v16
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v15, v10, v16
+; GISEL-NEXT: v_mul_hi_u32 v16, v8, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v17, v11
+; GISEL-NEXT: v_mul_lo_u32 v17, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v17
+; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v15, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, v16, v15
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, v10, v14, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v12, v8
+; GISEL-NEXT: v_mul_lo_u32 v14, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v13, v13, v8
+; GISEL-NEXT: v_mul_lo_u32 v12, v12, v8
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v8, v12
+; GISEL-NEXT: v_mul_lo_u32 v14, v8, v11
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v10, v12
; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v13, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v11
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v13, v6
+; GISEL-NEXT: v_mul_lo_u32 v13, v10, v11
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v12, v13
+; GISEL-NEXT: v_mul_hi_u32 v11, v10, v11
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v8, v6
+; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v10, v7, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v10, v2, v7
; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v6, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v6, 0x4f800000, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v6
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT: v_trunc_f32_e32 v7, v7
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, 0, v4
-; GISEL-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v6
-; GISEL-NEXT: v_mul_lo_u32 v12, v8, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v12, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v10
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT: v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT: v_mul_lo_u32 v10, v8, v6
-; GISEL-NEXT: v_mul_hi_u32 v11, v8, v6
-; GISEL-NEXT: v_mul_lo_u32 v8, v8, v7
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_mul_hi_u32 v9, v6, v10
-; GISEL-NEXT: v_mul_lo_u32 v11, v6, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v7, v10
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_mul_hi_u32 v10, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, v7, v8, vcc
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v6
-; GISEL-NEXT: v_mul_lo_u32 v9, v2, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v3, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v10
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v3, v6
; GISEL-NEXT: v_mul_hi_u32 v9, v2, v7
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v10, v8
; GISEL-NEXT: v_mul_lo_u32 v10, v3, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v3, v7
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6
-; GISEL-NEXT: v_mul_hi_u32 v9, v4, v6
+; GISEL-NEXT: v_mul_hi_u32 v8, v4, v6
; GISEL-NEXT: v_mul_lo_u32 v7, v4, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v4, v6
; GISEL-NEXT: v_mul_lo_u32 v6, v5, v6
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v6, vcc
; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v3, v6
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v5
@@ -1594,18 +1594,18 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v5
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v4
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v2, v4
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v4
; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v5, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v9, vcc, 0, v3, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v5
; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], v3, v5, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
@@ -1618,140 +1618,140 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-LABEL: v_urem_v2i64_pow2_shl_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_mov_b32_e32 v8, v0
-; CGP-NEXT: v_mov_b32_e32 v9, v1
-; CGP-NEXT: v_mov_b32_e32 v5, v2
-; CGP-NEXT: v_mov_b32_e32 v7, v3
; CGP-NEXT: v_mov_b32_e32 v10, 0x1000
; CGP-NEXT: v_mov_b32_e32 v11, 0
-; CGP-NEXT: v_mov_b32_e32 v0, 0
+; CGP-NEXT: v_mov_b32_e32 v5, v2
+; CGP-NEXT: v_mov_b32_e32 v7, v3
; CGP-NEXT: v_lshl_b64 v[2:3], v[10:11], v4
+; CGP-NEXT: v_mov_b32_e32 v9, v1
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
+; CGP-NEXT: v_mov_b32_e32 v8, v0
; CGP-NEXT: v_or_b32_e32 v1, v9, v3
+; CGP-NEXT: v_mov_b32_e32 v0, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
; CGP-NEXT: ; implicit-def: $vgpr0_vgpr1
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
; CGP-NEXT: s_cbranch_execz .LBB8_2
; CGP-NEXT: ; %bb.1:
; CGP-NEXT: v_cvt_f32_u32_e32 v0, v3
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
-; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v0
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v2
+; CGP-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
; CGP-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v1, 0x2f800000, v0
+; CGP-NEXT: v_trunc_f32_e32 v1, v1
+; CGP-NEXT: v_mac_f32_e32 v0, 0xcf800000, v1
+; CGP-NEXT: v_cvt_u32_f32_e32 v1, v1
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v4
-; CGP-NEXT: v_mul_lo_u32 v14, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v15, v1, v0
+; CGP-NEXT: v_mul_lo_u32 v13, v4, v1
+; CGP-NEXT: v_mul_hi_u32 v14, v4, v0
; CGP-NEXT: v_mul_lo_u32 v16, v12, v0
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; CGP-NEXT: v_mul_hi_u32 v15, v0, v14
-; CGP-NEXT: v_mul_lo_u32 v17, v4, v14
-; CGP-NEXT: v_mul_hi_u32 v14, v4, v14
+; CGP-NEXT: v_mul_lo_u32 v15, v4, v0
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT: v_mul_hi_u32 v14, v0, v15
; CGP-NEXT: v_mul_lo_u32 v16, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v18, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v19, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v4, v13
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT: v_mul_lo_u32 v17, v1, v15
+; CGP-NEXT: v_mul_hi_u32 v15, v1, v15
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v18
-; CGP-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v17
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v17, v0, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT: v_mul_lo_u32 v16, v1, v13
; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v19
; CGP-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v18, v17
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v15
+; CGP-NEXT: v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v16, vcc, v17, v16
+; CGP-NEXT: v_mul_hi_u32 v13, v1, v13
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v15, vcc, v16, v15
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v13, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v1, v0
-; CGP-NEXT: v_mul_hi_u32 v14, v1, v0
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v13, vcc
+; CGP-NEXT: v_mul_hi_u32 v13, v4, v0
+; CGP-NEXT: v_mul_lo_u32 v14, v4, v1
; CGP-NEXT: v_mul_lo_u32 v12, v12, v0
-; CGP-NEXT: v_mul_lo_u32 v1, v1, v4
-; CGP-NEXT: v_mul_hi_u32 v15, v0, v13
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v4, v13
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v14, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; CGP-NEXT: v_mul_lo_u32 v12, v0, v1
-; CGP-NEXT: v_mul_hi_u32 v14, v0, v1
-; CGP-NEXT: v_mul_lo_u32 v17, v4, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v4, v1
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v0
; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v16
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v15, v12
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v14, v16
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v13, v0, v4
+; CGP-NEXT: v_mul_lo_u32 v14, v0, v12
+; CGP-NEXT: v_mul_lo_u32 v15, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v1, v4
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v15
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v15, v0, v12
; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v8, v0
-; CGP-NEXT: v_mul_lo_u32 v12, v9, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v13, v8, v1
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v1
-; CGP-NEXT: v_mul_lo_u32 v15, v9, v1
-; CGP-NEXT: v_mul_hi_u32 v1, v9, v1
+; CGP-NEXT: v_mul_lo_u32 v14, v1, v12
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v15
+; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v14
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT: v_mul_hi_u32 v12, v1, v12
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v12, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v8, v0
+; CGP-NEXT: v_mul_lo_u32 v12, v8, v1
+; CGP-NEXT: v_mul_lo_u32 v13, v9, v0
+; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v13
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v15
+; CGP-NEXT: v_mul_hi_u32 v13, v8, v1
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
+; CGP-NEXT: v_mul_lo_u32 v12, v9, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v12
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v14, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; CGP-NEXT: v_mul_hi_u32 v1, v9, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v12, v4
-; CGP-NEXT: v_mul_lo_u32 v12, v2, v0
-; CGP-NEXT: v_mul_hi_u32 v13, v2, v0
-; CGP-NEXT: v_mul_lo_u32 v0, v3, v0
; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v2, v0
; CGP-NEXT: v_mul_lo_u32 v1, v2, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT: v_mul_lo_u32 v12, v2, v0
+; CGP-NEXT: v_mul_lo_u32 v0, v3, v0
+; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; CGP-NEXT: v_sub_i32_e32 v1, vcc, v8, v12
; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v9, v0, vcc
; CGP-NEXT: v_sub_i32_e64 v0, s[4:5], v9, v0
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
-; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v3
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v1, v2
; CGP-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v3
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; CGP-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v9, vcc, v1, v2
; CGP-NEXT: v_subbrev_u32_e64 v12, s[4:5], 0, v0, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v12, v3
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v2
; CGP-NEXT: v_subb_u32_e32 v0, vcc, v0, v3, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v12, v3
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v12, v3
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v9, v2
+; CGP-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v12, v3
-; CGP-NEXT: v_cndmask_b32_e32 v3, v14, v13, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v13
; CGP-NEXT: v_cndmask_b32_e32 v2, v9, v2, vcc
; CGP-NEXT: v_cndmask_b32_e32 v3, v12, v0, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
@@ -1767,14 +1767,14 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_cbranch_execz .LBB8_4
; CGP-NEXT: ; %bb.3:
; CGP-NEXT: v_rcp_iflag_f32_e32 v0, v4
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v2
-; CGP-NEXT: v_mov_b32_e32 v1, 0
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, 0, v2
; CGP-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
; CGP-NEXT: v_cvt_u32_f32_e32 v0, v0
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v0
-; CGP-NEXT: v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v1, v1, v0
+; CGP-NEXT: v_mul_hi_u32 v1, v0, v1
+; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; CGP-NEXT: v_mul_hi_u32 v0, v8, v0
+; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mul_lo_u32 v0, v0, v2
; CGP-NEXT: v_sub_i32_e32 v0, vcc, v8, v0
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v2
@@ -1785,10 +1785,10 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; CGP-NEXT: .LBB8_4: ; %.split
; CGP-NEXT: s_or_b64 exec, exec, s[4:5]
+; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9
; CGP-NEXT: v_or_b32_e32 v3, v7, v10
; CGP-NEXT: v_mov_b32_e32 v2, 0
; CGP-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v9
; CGP-NEXT: ; implicit-def: $vgpr2_vgpr3
; CGP-NEXT: s_and_saveexec_b64 s[4:5], vcc
; CGP-NEXT: s_xor_b64 s[6:7], exec, s[4:5]
@@ -1801,122 +1801,122 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_setpc_b64 s[30:31]
; CGP-NEXT: .LBB8_7:
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v10
-; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
-; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc
; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v2
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
+; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9
+; CGP-NEXT: v_subb_u32_e32 v6, vcc, 0, v10, vcc
; CGP-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CGP-NEXT: v_mul_f32_e32 v4, 0x2f800000, v2
-; CGP-NEXT: v_trunc_f32_e32 v4, v4
-; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
+; CGP-NEXT: v_trunc_f32_e32 v3, v3
+; CGP-NEXT: v_mac_f32_e32 v2, 0xcf800000, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v4
-; CGP-NEXT: v_mul_lo_u32 v11, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v12, v3, v2
+; CGP-NEXT: v_mul_lo_u32 v8, v4, v3
+; CGP-NEXT: v_mul_hi_u32 v11, v4, v2
; CGP-NEXT: v_mul_lo_u32 v13, v6, v2
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v11
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v11
-; CGP-NEXT: v_mul_hi_u32 v11, v4, v11
+; CGP-NEXT: v_mul_lo_u32 v12, v4, v2
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v13
+; CGP-NEXT: v_mul_hi_u32 v11, v2, v12
; CGP-NEXT: v_mul_lo_u32 v13, v2, v8
-; CGP-NEXT: v_mul_hi_u32 v15, v2, v8
-; CGP-NEXT: v_mul_lo_u32 v16, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v4, v8
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_mul_lo_u32 v14, v3, v12
+; CGP-NEXT: v_mul_hi_u32 v12, v3, v12
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v14, v2, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT: v_mul_lo_u32 v13, v3, v8
; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v14
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v16
; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT: v_add_i32_e32 v12, vcc, v12, v13
+; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT: v_mul_hi_u32 v8, v3, v8
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v12, vcc, v13, v12
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_addc_u32_e32 v4, vcc, v4, v8, vcc
-; CGP-NEXT: v_mul_lo_u32 v8, v3, v2
-; CGP-NEXT: v_mul_hi_u32 v11, v3, v2
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT: v_mul_hi_u32 v8, v4, v2
+; CGP-NEXT: v_mul_lo_u32 v11, v4, v3
; CGP-NEXT: v_mul_lo_u32 v6, v6, v2
-; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v12, v2, v8
-; CGP-NEXT: v_mul_lo_u32 v13, v4, v8
-; CGP-NEXT: v_mul_hi_u32 v8, v4, v8
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v11, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_lo_u32 v6, v2, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v2, v3
-; CGP-NEXT: v_mul_lo_u32 v14, v4, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v4, v3
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v14
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v12, v6
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v13
; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_mul_hi_u32 v8, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v11, v2, v6
+; CGP-NEXT: v_mul_lo_u32 v12, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v12
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_mul_hi_u32 v12, v2, v6
; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc
-; CGP-NEXT: v_mul_hi_u32 v4, v5, v2
-; CGP-NEXT: v_mul_lo_u32 v6, v7, v2
-; CGP-NEXT: v_mul_hi_u32 v2, v7, v2
-; CGP-NEXT: v_mul_lo_u32 v8, v5, v3
-; CGP-NEXT: v_mul_hi_u32 v11, v5, v3
-; CGP-NEXT: v_mul_lo_u32 v12, v7, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v7, v3
+; CGP-NEXT: v_mul_lo_u32 v11, v3, v6
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v12
+; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v8, vcc, v11, v8
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v6, vcc
+; CGP-NEXT: v_mul_hi_u32 v4, v5, v2
+; CGP-NEXT: v_mul_lo_u32 v6, v5, v3
+; CGP-NEXT: v_mul_lo_u32 v8, v7, v2
+; CGP-NEXT: v_mul_hi_u32 v2, v7, v2
; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v8
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v12
+; CGP-NEXT: v_mul_hi_u32 v8, v5, v3
+; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; CGP-NEXT: v_mul_lo_u32 v6, v7, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v8
+; CGP-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; CGP-NEXT: v_add_i32_e32 v6, vcc, v8, v6
+; CGP-NEXT: v_mul_hi_u32 v3, v7, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT: v_mul_lo_u32 v6, v9, v2
-; CGP-NEXT: v_mul_hi_u32 v8, v9, v2
-; CGP-NEXT: v_mul_lo_u32 v2, v10, v2
; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT: v_mul_hi_u32 v4, v9, v2
; CGP-NEXT: v_mul_lo_u32 v3, v9, v3
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v3
+; CGP-NEXT: v_mul_lo_u32 v6, v9, v2
+; CGP-NEXT: v_mul_lo_u32 v2, v10, v2
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v4, v3
; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
; CGP-NEXT: v_sub_i32_e32 v3, vcc, v5, v6
; CGP-NEXT: v_subb_u32_e64 v4, s[4:5], v7, v2, vcc
; CGP-NEXT: v_sub_i32_e64 v2, s[4:5], v7, v2
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v10
+; CGP-NEXT: v_cndmask_b32_e64 v5, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v9
; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v4, v10
; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v4, v10
-; CGP-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[4:5]
; CGP-NEXT: v_sub_i32_e32 v6, vcc, v3, v9
; CGP-NEXT: v_subbrev_u32_e64 v7, s[4:5], 0, v2, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v10
; CGP-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v9
; CGP-NEXT: v_subb_u32_e32 v2, vcc, v2, v10, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
+; CGP-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
+; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v10
; CGP-NEXT: v_sub_i32_e32 v9, vcc, v6, v9
+; CGP-NEXT: v_cndmask_b32_e64 v8, v8, v11, s[4:5]
; CGP-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, v7, v10
-; CGP-NEXT: v_cndmask_b32_e32 v8, v11, v8, vcc
; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
; CGP-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
; CGP-NEXT: v_cndmask_b32_e32 v7, v7, v2, vcc
@@ -1930,14 +1930,14 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
; CGP-NEXT: s_cbranch_execz .LBB8_6
; CGP-NEXT: .LBB8_8:
; CGP-NEXT: v_rcp_iflag_f32_e32 v2, v4
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, 0, v9
-; CGP-NEXT: v_mov_b32_e32 v3, 0
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v9
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_hi_u32 v4, v2, v4
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v2
+; CGP-NEXT: v_mul_hi_u32 v3, v2, v3
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
; CGP-NEXT: v_mul_hi_u32 v2, v5, v2
+; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: v_mul_lo_u32 v2, v2, v9
; CGP-NEXT: v_sub_i32_e32 v2, vcc, v5, v2
; CGP-NEXT: v_sub_i32_e32 v4, vcc, v2, v9
@@ -1957,10 +1957,10 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
; GISEL-LABEL: v_urem_i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v2
; GISEL-NEXT: v_cvt_f32_u32_e32 v2, v1
; GISEL-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; GISEL-NEXT: v_rcp_iflag_f32_e32 v2, v2
; GISEL-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -1982,10 +1982,10 @@ define i64 @v_urem_i64_24bit(i64 %num, i64 %den) {
; CGP-LABEL: v_urem_i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
; CGP-NEXT: v_cvt_f32_u32_e32 v2, v1
; CGP-NEXT: v_sub_i32_e32 v3, vcc, 0, v1
+; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
; CGP-NEXT: v_rcp_f32_e32 v2, v2
; CGP-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
; CGP-NEXT: v_cvt_u32_f32_e32 v2, v2
@@ -2013,274 +2013,274 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-LABEL: v_urem_v2i64_24bit:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v4
-; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v6
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT: v_cvt_f32_u32_e32 v7, v3
-; GISEL-NEXT: v_sub_i32_e32 v4, vcc, 0, v3
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v8, v1
-; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT: v_mac_f32_e32 v8, 0x4f800000, v6
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v6, v7
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v8
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; GISEL-NEXT: v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v6
-; GISEL-NEXT: v_mul_f32_e32 v11, 0x2f800000, v7
+; GISEL-NEXT: v_and_b32_e32 v4, 0xffffff, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v3, v4
+; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v1, 0
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; GISEL-NEXT: v_mac_f32_e32 v3, 0x4f800000, v1
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
+; GISEL-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v3
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v3, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v7, v3
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v3
+; GISEL-NEXT: v_mul_lo_u32 v11, v7, v3
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v12
+; GISEL-NEXT: v_mul_hi_u32 v10, v3, v11
+; GISEL-NEXT: v_mul_lo_u32 v12, v3, v9
+; GISEL-NEXT: v_mul_lo_u32 v13, v5, v11
+; GISEL-NEXT: v_mul_hi_u32 v11, v5, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v13, v3, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT: v_mul_lo_u32 v12, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v12, vcc, v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v3, v10
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v3, v7, v10
+; GISEL-NEXT: v_mul_lo_u32 v9, v7, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v8, v10
+; GISEL-NEXT: v_mul_lo_u32 v7, v7, v10
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v9
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v3, v8
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v7
+; GISEL-NEXT: v_mul_lo_u32 v11, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v3, 0xffffff, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v5, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v11, v6
+; GISEL-NEXT: v_mul_lo_u32 v11, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v11
+; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc
+; GISEL-NEXT: v_mul_hi_u32 v7, v0, v6
+; GISEL-NEXT: v_mul_lo_u32 v8, v0, v5
+; GISEL-NEXT: v_mul_hi_u32 v6, 0, v6
+; GISEL-NEXT: v_mul_hi_u32 v9, v0, v5
+; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v3
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT: v_mul_lo_u32 v8, 0, v5
+; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5
+; GISEL-NEXT: v_mul_lo_u32 v8, v4, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, 0, v6
+; GISEL-NEXT: v_mac_f32_e32 v10, 0x4f800000, v1
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], 0, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
+; GISEL-NEXT: v_cndmask_b32_e32 v9, -1, v9, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v8, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc
+; GISEL-NEXT: v_mul_f32_e32 v8, 0x2f800000, v1
; GISEL-NEXT: v_trunc_f32_e32 v8, v8
-; GISEL-NEXT: v_trunc_f32_e32 v11, v11
-; GISEL-NEXT: v_mac_f32_e32 v6, 0xcf800000, v8
+; GISEL-NEXT: v_mac_f32_e32 v1, 0xcf800000, v8
+; GISEL-NEXT: v_cvt_u32_f32_e32 v1, v1
; GISEL-NEXT: v_cvt_u32_f32_e32 v8, v8
-; GISEL-NEXT: v_mac_f32_e32 v7, 0xcf800000, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v11
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v4, v8
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v11
-; GISEL-NEXT: v_mul_lo_u32 v14, v4, v12
-; GISEL-NEXT: v_mul_hi_u32 v15, v4, v12
-; GISEL-NEXT: v_mul_lo_u32 v16, v5, v12
-; GISEL-NEXT: v_mul_lo_u32 v17, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v18, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v19, v10, v7
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_mul_hi_u32 v15, v12, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v18, v13
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v14
-; GISEL-NEXT: v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v6, v16
-; GISEL-NEXT: v_mul_lo_u32 v6, v12, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v15, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v18
-; GISEL-NEXT: v_mul_hi_u32 v6, v7, v17
-; GISEL-NEXT: v_mul_lo_u32 v18, v11, v17
-; GISEL-NEXT: v_mul_hi_u32 v17, v11, v17
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v19
-; GISEL-NEXT: v_mul_lo_u32 v19, v7, v13
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v18
-; GISEL-NEXT: v_mul_hi_u32 v6, v12, v16
-; GISEL-NEXT: v_mul_hi_u32 v18, v7, v13
-; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], v17, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v18
-; GISEL-NEXT: v_mul_lo_u32 v18, v8, v16
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v14, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v18, vcc, v6, v18
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v19, vcc, v19, v6
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; GISEL-NEXT: v_sub_i32_e32 v9, vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v10, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_mul_hi_u32 v11, v9, v1
+; GISEL-NEXT: v_mul_lo_u32 v12, v9, v8
+; GISEL-NEXT: v_mul_lo_u32 v14, v10, v1
+; GISEL-NEXT: v_mul_lo_u32 v13, v9, v1
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, v11, v14
+; GISEL-NEXT: v_mul_hi_u32 v12, v1, v13
+; GISEL-NEXT: v_mul_lo_u32 v14, v1, v11
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT: v_mul_lo_u32 v4, v8, v13
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v7, v8, v13
+; GISEL-NEXT: v_mul_hi_u32 v13, v1, v11
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT: v_mul_lo_u32 v12, v8, v11
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v12, s[4:5], v13, v12
+; GISEL-NEXT: v_mul_hi_u32 v11, v8, v11
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v7, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v1, v4
+; GISEL-NEXT: v_addc_u32_e64 v7, s[4:5], v8, v7, s[4:5]
+; GISEL-NEXT: v_mul_hi_u32 v1, v9, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
+; GISEL-NEXT: v_mul_lo_u32 v10, v10, v4
+; GISEL-NEXT: v_mul_lo_u32 v9, v9, v4
+; GISEL-NEXT: v_add_i32_e64 v1, s[4:5], v1, v8
+; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v1, v10
+; GISEL-NEXT: v_mul_hi_u32 v10, v4, v9
+; GISEL-NEXT: v_mul_lo_u32 v11, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v7, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v10, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v9, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT: v_mul_lo_u32 v10, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v20, vcc, v20, v6
-; GISEL-NEXT: v_and_b32_e32 v6, 0xffffff, v0
-; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v2
-; GISEL-NEXT: v_mul_hi_u32 v2, v8, v16
-; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, v17, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v16
-; GISEL-NEXT: v_add_i32_e32 v16, vcc, v20, v18
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v14
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v15
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v8, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v4, v12
-; GISEL-NEXT: v_mul_hi_u32 v14, v4, v12
-; GISEL-NEXT: v_mul_lo_u32 v5, v5, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v11, v13, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v15, v9, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v10, v7
-; GISEL-NEXT: v_mul_lo_u32 v4, v4, v2
-; GISEL-NEXT: v_mul_hi_u32 v16, v12, v8
-; GISEL-NEXT: v_mul_lo_u32 v17, v2, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v2, v8
-; GISEL-NEXT: v_mul_lo_u32 v9, v9, v11
-; GISEL-NEXT: v_mul_hi_u32 v18, v7, v13
-; GISEL-NEXT: v_mul_lo_u32 v19, v11, v13
-; GISEL-NEXT: v_mul_hi_u32 v13, v11, v13
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v14, v4
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v15, v9
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v8, v6
; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v10
-; GISEL-NEXT: v_mul_lo_u32 v9, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v14, v2, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v2, v4
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v18, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_mul_hi_u32 v15, v7, v5
-; GISEL-NEXT: v_mul_lo_u32 v19, v11, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v16, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v13, s[4:5], v13, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], v9, v17
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v8, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v14
-; GISEL-NEXT: v_add_i32_e32 v14, vcc, v18, v17
-; GISEL-NEXT: v_add_i32_e32 v15, vcc, v15, v19
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v14
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v2, v4, vcc
-; GISEL-NEXT: v_mul_hi_u32 v4, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, 0, v8
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v13
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v0, v7
-; GISEL-NEXT: v_mul_hi_u32 v7, 0, v7
-; GISEL-NEXT: v_mul_lo_u32 v10, v6, v2
-; GISEL-NEXT: v_mul_hi_u32 v11, v6, v2
-; GISEL-NEXT: v_mul_lo_u32 v12, 0, v2
-; GISEL-NEXT: v_mul_hi_u32 v2, 0, v2
-; GISEL-NEXT: v_mul_lo_u32 v13, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v14, v0, v5
-; GISEL-NEXT: v_mul_lo_u32 v15, 0, v5
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v2, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v2, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, 0, v4
+; GISEL-NEXT: v_mul_hi_u32 v8, v2, v5
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_mul_lo_u32 v7, 0, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_mul_lo_u32 v2, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; GISEL-NEXT: v_mul_lo_u32 v5, v1, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v15
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT: v_mul_lo_u32 v8, v3, v4
-; GISEL-NEXT: v_mul_hi_u32 v9, v3, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; GISEL-NEXT: v_mul_hi_u32 v6, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v3, v5
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v4
; GISEL-NEXT: v_mul_lo_u32 v4, 0, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v1, v7
-; GISEL-NEXT: v_mul_hi_u32 v11, v1, v7
-; GISEL-NEXT: v_mul_lo_u32 v7, 0, v7
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v7
-; GISEL-NEXT: v_sub_i32_e32 v5, vcc, v6, v8
-; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], 0, v2, vcc
-; GISEL-NEXT: v_sub_i32_e64 v2, s[4:5], 0, v2
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v10
-; GISEL-NEXT: v_subb_u32_e64 v9, s[6:7], 0, v4, s[4:5]
-; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], 0, v4
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v8, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v7, -1, v7, s[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GISEL-NEXT: v_subbrev_u32_e64 v0, vcc, 0, v0, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v5, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v8, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v12, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v11, -1, v11, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v10, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v2, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v12, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v12, v1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v2, v14, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v13, v15, s[4:5]
-; GISEL-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v2, v8, v1, s[4:5]
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v10, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v3, v9, v3, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v7
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], 0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], 0, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v2, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v8, -1, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v7, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, -1, v6, s[4:5]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_urem_v2i64_24bit:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; CGP-NEXT: v_cvt_f32_u32_e32 v3, v1
+; CGP-NEXT: v_and_b32_e32 v4, 0xffffff, v6
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, 0, v1
+; CGP-NEXT: v_rcp_f32_e32 v3, v3
+; CGP-NEXT: v_cvt_f32_u32_e32 v5, v4
; CGP-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; CGP-NEXT: v_and_b32_e32 v1, 0xffffff, v2
-; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v4
-; CGP-NEXT: v_and_b32_e32 v3, 0xffffff, v6
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, 0, v2
-; CGP-NEXT: v_cvt_f32_u32_e32 v6, v3
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v3
-; CGP-NEXT: v_rcp_f32_e32 v4, v4
-; CGP-NEXT: v_rcp_f32_e32 v6, v6
-; CGP-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; CGP-NEXT: v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; CGP-NEXT: v_cvt_u32_f32_e32 v4, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v4
-; CGP-NEXT: v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT: v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT: v_mul_hi_u32 v6, v0, v4
-; CGP-NEXT: v_mul_lo_u32 v4, 0, v4
-; CGP-NEXT: v_mul_hi_u32 v7, v1, v5
-; CGP-NEXT: v_mul_lo_u32 v5, 0, v5
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT: v_mul_lo_u32 v4, v4, v2
-; CGP-NEXT: v_mul_lo_u32 v5, v5, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v5
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
-; CGP-NEXT: v_sub_i32_e32 v4, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v5, vcc, v1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v1, v5, vcc
+; CGP-NEXT: v_sub_i32_e32 v7, vcc, 0, v4
+; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
+; CGP-NEXT: v_rcp_f32_e32 v5, v5
+; CGP-NEXT: v_and_b32_e32 v2, 0xffffff, v2
+; CGP-NEXT: v_mul_lo_u32 v6, v6, v3
+; CGP-NEXT: v_mul_f32_e32 v5, 0x4f7ffffe, v5
+; CGP-NEXT: v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT: v_mul_hi_u32 v6, v3, v6
+; CGP-NEXT: v_mul_lo_u32 v7, v7, v5
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT: v_mul_hi_u32 v6, v0, v3
+; CGP-NEXT: v_mul_lo_u32 v3, 0, v3
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v1
+; CGP-NEXT: v_mul_hi_u32 v6, v5, v7
+; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; CGP-NEXT: v_sub_i32_e32 v3, vcc, v0, v1
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v6
+; CGP-NEXT: v_mul_hi_u32 v5, v2, v3
+; CGP-NEXT: v_mul_lo_u32 v3, 0, v3
+; CGP-NEXT: v_sub_i32_e32 v6, vcc, v0, v1
+; CGP-NEXT: v_add_i32_e32 v3, vcc, v5, v3
+; CGP-NEXT: v_mul_lo_u32 v3, v3, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v1
+; CGP-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; CGP-NEXT: v_sub_i32_e32 v1, vcc, v2, v3
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
+; CGP-NEXT: v_sub_i32_e32 v2, vcc, v1, v4
+; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v4
+; CGP-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; CGP-NEXT: v_mov_b32_e32 v1, 0
; CGP-NEXT: v_mov_b32_e32 v3, 0
; CGP-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
index 5408ad0747b74..58b3330e52201 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/v_bfe_i32.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel < %s | FileCheck --check-prefix=PREGFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --global-isel < %s | FileCheck --check-prefix=PREGFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=hawaii < %s | FileCheck --check-prefix=PREGFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=fiji < %s | FileCheck --check-prefix=PREGFX9 %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa --global-isel -mcpu=gfx90a < %s | FileCheck --check-prefix=PREGFX9 %s
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
index da502b1ffa9de..bd3e38c6c1060 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void-inseltpoison.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
@array = external addrspace(4) constant [32 x [800 x i32]], align 4
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll
index c363e81cf3c4d..88e5410f024d9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-void.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 < %s | FileCheck -check-prefix=GCN %s
@array = external addrspace(4) constant [32 x [800 x i32]], align 4
More information about the llvm-branch-commits
mailing list