[llvm] e52acb8 - GlobalISel: Add shifts to constant_fold combine

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Thu Aug 24 05:10:14 PDT 2023


Author: Matt Arsenault
Date: 2023-08-24T08:09:57-04:00
New Revision: e52acb817d87a98ae6a2af675be2084160c18db4

URL: https://github.com/llvm/llvm-project/commit/e52acb817d87a98ae6a2af675be2084160c18db4
DIFF: https://github.com/llvm/llvm-project/commit/e52acb817d87a98ae6a2af675be2084160c18db4.diff

LOG: GlobalISel: Add shifts to constant_fold combine

Currently we're getting away with relying on post-selection constant
folding for these shifts (a hack which exists for the benefit of the
SelectionDAG path).

https://reviews.llvm.org/D156534

Added: 
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-ashr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-lshr.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-shl.mir

Modified: 
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/test/CodeGen/AArch64/funnel-shift.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 8607f300ee1a23..5cee4f7e15c5e1 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -907,7 +907,7 @@ def reassocs : GICombineGroup<[reassoc_ptradd, reassoc_comm_binops]>;
 // Constant fold operations.
 def constant_fold : GICombineRule<
   (defs root:$d, apint_matchinfo:$matchinfo),
-  (match (wip_match_opcode G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR):$d,
+  (match (wip_match_opcode G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR, G_SHL, G_LSHR, G_ASHR):$d,
    [{ return Helper.matchConstantFold(*${d}, ${matchinfo}); }]),
   (apply [{ Helper.replaceInstWithConstant(*${d}, ${matchinfo}); }])>;
 

diff  --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index 25861c1c6c2a5d..bdc7de9e853f40 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -192,16 +192,10 @@ define i8 @fshl_i8_const_fold_overshift_1() {
 }
 
 define i8 @fshl_i8_const_fold_overshift_2() {
-; CHECK-SD-LABEL: fshl_i8_const_fold_overshift_2:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    mov w0, #120 // =0x78
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: fshl_i8_const_fold_overshift_2:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-NEXT:    lsl w0, w8, #3
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: fshl_i8_const_fold_overshift_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, #120 // =0x78
+; CHECK-NEXT:    ret
   %f = call i8 @llvm.fshl.i8(i8 15, i8 15, i8 11)
   ret i8 %f
 }
@@ -393,9 +387,7 @@ define i8 @fshr_i8_const_fold_overshift_2() {
 ;
 ; CHECK-GI-LABEL: fshr_i8_const_fold_overshift_2:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    mov w8, #15 // =0xf
-; CHECK-GI-NEXT:    lsl w9, w8, #5
-; CHECK-GI-NEXT:    orr w0, w9, w8, lsr #3
+; CHECK-GI-NEXT:    mov w0, #481 // =0x1e1
 ; CHECK-GI-NEXT:    ret
   %f = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11)
   ret i8 %f

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-ashr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-ashr.mir
new file mode 100644
index 00000000000000..2b10ee54eeebde
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-ashr.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:  constant_fold_ashr_s32_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_s32_s32
+    ; CHECK: %shift:_(s32) = G_CONSTANT i32 -482254
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(s32)
+    %val:_(s32) = G_CONSTANT i32 -123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s32) = G_ASHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_ashr_s16_s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_s16_s16
+    ; CHECK: %shift:_(s16) = G_CONSTANT i16 -772
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %shift(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %ext(s32)
+    %val:_(s16) = G_CONSTANT i16 -12345
+    %shift_amt:_(s16) = G_CONSTANT i16 4
+    %shift:_(s16) = G_ASHR %val, %shift_amt
+    %ext:_(s32) = G_ANYEXT %shift
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:  constant_fold_ashr_s64_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_s64_s32
+    ; CHECK: %shift:_(s64) = G_CONSTANT i64 -482254
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %shift(s64)
+    %val:_(s64) = G_CONSTANT i64 -123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s64) = G_ASHR %val, %shift_amt
+    $vgpr0_vgpr1 = COPY %shift
+
+...
+
+---
+name:  constant_fold_ashr_v2s16_v2s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_v2s16_v2s16
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 -5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 -5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_ashr_v2s16_v2s16_undef_amt_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_v2s16_v2s16_undef_amt_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 -5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %undef(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 -5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %undef
+    %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_ashr_v2s16_v2s16_undef_val_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_ashr_v2s16_v2s16_undef_val_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 -1234
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %undef(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 -1234
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %undef
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_ASHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-lshr.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-lshr.mir
new file mode 100644
index 00000000000000..9e2dece906887b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-lshr.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:  constant_fold_lshr_s32_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_s32_s32
+    ; CHECK: %shift:_(s32) = G_CONSTANT i32 16294962
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(s32)
+    %val:_(s32) = G_CONSTANT i32 -123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s32) = G_LSHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_lshr_s16_s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_s16_s16
+    ; CHECK: %shift:_(s16) = G_CONSTANT i16 3324
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %shift(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %ext(s32)
+    %val:_(s16) = G_CONSTANT i16 -12345
+    %shift_amt:_(s16) = G_CONSTANT i16 4
+    %shift:_(s16) = G_LSHR %val, %shift_amt
+    %ext:_(s32) = G_ANYEXT %shift
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:  constant_fold_lshr_s64_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_s64_s32
+    ; CHECK: %shift:_(s64) = G_CONSTANT i64 72057594037445682
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %shift(s64)
+    %val:_(s64) = G_CONSTANT i64 -123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s64) = G_LSHR %val, %shift_amt
+    $vgpr0_vgpr1 = COPY %shift
+
+...
+
+---
+name:  constant_fold_lshr_v2s16_v2s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_v2s16_v2s16
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 -5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 -5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_lshr_v2s16_v2s16_undef_amt_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_v2s16_v2s16_undef_amt_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 -5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %undef(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 -5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %undef
+    %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_lshr_v2s16_v2s16_undef_val_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_lshr_v2s16_v2s16_undef_val_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 -1234
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %undef(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 -1234
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %undef
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_LSHR %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-shl.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-shl.mir
new file mode 100644
index 00000000000000..6962a53dd331e3
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizercombiner-shl.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=amdgpu-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:  constant_fold_shl_s32_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_s32_s32
+    ; CHECK: %shift:_(s32) = G_CONSTANT i32 1540166912
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(s32)
+    %val:_(s32) = G_CONSTANT i32 123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s32) = G_SHL %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_shl_s16_s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_s16_s16
+    ; CHECK: %shift:_(s16) = G_CONSTANT i16 912
+    ; CHECK-NEXT: %ext:_(s32) = G_ANYEXT %shift(s16)
+    ; CHECK-NEXT: $vgpr0 = COPY %ext(s32)
+    %val:_(s16) = G_CONSTANT i16 12345
+    %shift_amt:_(s16) = G_CONSTANT i16 4
+    %shift:_(s16) = G_SHL %val, %shift_amt
+    %ext:_(s32) = G_ANYEXT %shift
+    $vgpr0 = COPY %ext
+
+...
+
+---
+name:  constant_fold_shl_s64_s32
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_s64_s32
+    ; CHECK: %shift:_(s64) = G_CONSTANT i64 31604937984
+    ; CHECK-NEXT: $vgpr0_vgpr1 = COPY %shift(s64)
+    %val:_(s64) = G_CONSTANT i64 123456789
+    %shift_amt:_(s32) = G_CONSTANT i32 8
+    %shift:_(s64) = G_SHL %val, %shift_amt
+    $vgpr0_vgpr1 = COPY %shift
+
+...
+
+---
+name:  constant_fold_shl_v2s16_v2s16
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_v2s16_v2s16
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_SHL %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_SHL %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_shl_v2s16_v2s16_undef_amt_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_v2s16_v2s16_undef_amt_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %val1:_(s16) = G_CONSTANT i16 5678
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %val1(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %undef(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_SHL %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %val1:_(s16) = G_CONSTANT i16 5678
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %val1
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %undef
+    %shift:_(<2 x s16>) = G_SHL %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...
+
+---
+name:  constant_fold_shl_v2s16_v2s16_undef_val_elt
+legalized:       true
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    ; CHECK-LABEL: name: constant_fold_shl_v2s16_v2s16_undef_val_elt
+    ; CHECK: %val0:_(s16) = G_CONSTANT i16 1234
+    ; CHECK-NEXT: %undef:_(s16) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: %val:_(<2 x s16>) = G_BUILD_VECTOR %val0(s16), %undef(s16)
+    ; CHECK-NEXT: %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt(s16), %shift_amt_elt(s16)
+    ; CHECK-NEXT: %shift:_(<2 x s16>) = G_SHL %val, %shift_amt(<2 x s16>)
+    ; CHECK-NEXT: $vgpr0 = COPY %shift(<2 x s16>)
+    %val0:_(s16) = G_CONSTANT i16 1234
+    %undef:_(s16) = G_IMPLICIT_DEF
+    %val:_(<2 x s16>) = G_BUILD_VECTOR %val0, %undef
+    %shift_amt_elt:_(s16) = G_CONSTANT i16 8
+    %shift_amt:_(<2 x s16>) = G_BUILD_VECTOR %shift_amt_elt, %shift_amt_elt
+    %shift:_(<2 x s16>) = G_SHL %val, %shift_amt
+    $vgpr0 = COPY %shift
+
+...

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index c52e3e21aa98b1..ab000d91a3ef23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -312,51 +312,52 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s8, 0, 0x1000
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT:    s_movk_i32 s8, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x1000
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xfffff000
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    s_sub_i32 s4, 0, s8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GISEL-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
+; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v5, s8
+; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 12, v5
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_lshlrev_b32_e32 v9, 12, v4
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
+; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, s8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, s8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v6, s[4:5], s8, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e64 v7, s[6:7], s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
@@ -453,51 +454,51 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s8, 0, 0x12d8fb
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
+; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
-; GISEL-NEXT:    s_sub_i32 s4, 0, s8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, s4, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, v5, s8
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
-; GISEL-NEXT:    v_mul_lo_u32 v8, v4, s8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GISEL-NEXT:    v_mul_lo_u32 v6, v4, s8
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, s8
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-NEXT:    v_subrev_i32_e32 v6, vcc, s8, v0
+; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
+; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GISEL-NEXT:    v_subrev_i32_e64 v6, s[4:5], s8, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e64 v7, s[6:7], s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_oddk_denom:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 648e5214e2b1cc..7a81bc52e9ebf1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1131,268 +1131,258 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s8, 0
-; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
-; GISEL-NEXT:    s_mov_b32 s9, s8
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GISEL-NEXT:    s_sub_u32 s10, 0, s6
-; GISEL-NEXT:    s_subb_u32 s11, 0, s7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v8, v[5:6]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s11, v7, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v9, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s10, v8, v[4:5]
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v10, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8]
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v10, v4
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s11, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v8
+; GISEL-NEXT:    v_xor_b32_e32 v12, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x1000
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GISEL-NEXT:    s_sub_u32 s6, 0, s8
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v5, v14, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v15, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, s9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v13, v[5:6]
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v12, v14, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8]
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_mov_b32_e32 v9, s6
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v12, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x1000
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v10
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v13, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v13
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v15, s4
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, v15, v6, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v9, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v6
+; GISEL-NEXT:    v_mul_hi_u32 v15, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v15, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT:    v_mul_lo_u32 v15, v13, v6
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
+; GISEL-NEXT:    v_mul_hi_u32 v6, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v15, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v10, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v12, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v9, v11, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v2, v7
-; GISEL-NEXT:    v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT:    v_xor_b32_e32 v11, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v9, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_xor_b32_e32 v9, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s8, v10, 0
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s8, v12, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s9, v10, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_mov_b32_e32 v8, s9
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v11, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v11, v3
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
-; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v4
+; GISEL-NEXT:    v_mov_b32_e32 v7, s6
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v13, vcc
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v6
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v7
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
@@ -1791,268 +1781,258 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_sdiv_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s8, 0
-; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
-; GISEL-NEXT:    s_mov_b32 s9, s8
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GISEL-NEXT:    s_sub_u32 s10, 0, s6
-; GISEL-NEXT:    s_subb_u32 s11, 0, s7
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v8, v[5:6]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s11, v7, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v9, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s10, v8, v[4:5]
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v10, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8]
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v10, v4
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v9, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s11, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v8
+; GISEL-NEXT:    v_xor_b32_e32 v12, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v1, s7
-; GISEL-NEXT:    v_subb_u32_e64 v6, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, v7, v10, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v6, s9
-; GISEL-NEXT:    v_subrev_i32_e32 v0, vcc, s6, v0
-; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, 1, v8
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
-; GISEL-NEXT:    s_sub_u32 s6, 0, s8
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s7, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v5, v14, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v15, v[1:2]
-; GISEL-NEXT:    s_subb_u32 s7, 0, s9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v13, v[5:6]
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v11, v12, v14, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8]
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_mov_b32_e32 v9, s6
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v12, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v7, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x12d8fb
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, v9, v8, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v10
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, 0, v11, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v13, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v13
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v15, s4
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, v15, v6, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v9, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v6
+; GISEL-NEXT:    v_mul_hi_u32 v15, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v15, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_hi_u32 v5, v15, v5
+; GISEL-NEXT:    v_mul_lo_u32 v15, v13, v6
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
+; GISEL-NEXT:    v_mul_hi_u32 v6, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v15, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v10, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v8, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v12, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v10, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v9, v11, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v13, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v2, v7
-; GISEL-NEXT:    v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT:    v_xor_b32_e32 v11, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v3, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v9, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_xor_b32_e32 v9, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s8, v10, 0
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s8, v12, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s9, v10, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_mov_b32_e32 v8, s9
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v11, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v11, v3
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
-; GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, s8, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v4
+; GISEL-NEXT:    v_mov_b32_e32 v7, s6
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v6, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, 0, v12, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s9, v3
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v2
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v7, v6, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v10
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, 0, v13, vcc
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s9, v3
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v6, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v5, s4
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v6
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v6, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v6, v3, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v12, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v7
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 728e13788f9bc9..88ace1c51f5b02 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -286,47 +286,47 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s4, 0, 0x1000
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT:    s_movk_i32 s4, 0x1000
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xfffff000
+; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GISEL-NEXT:    s_sub_i32 s5, 0, s4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, s5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, s4
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v1
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_pow2k_denom:
@@ -417,47 +417,47 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s4, 0, 0x12d8fb
-; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
+; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s4
-; GISEL-NEXT:    s_sub_i32 s5, 0, s4
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
-; GISEL-NEXT:    v_mul_f32_e32 v5, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v6, s5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, s5, v4
-; GISEL-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v5
-; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v5, v5, s4
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GISEL-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v1
+; GISEL-NEXT:    v_mul_lo_u32 v3, v3, s4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v1
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_oddk_denom:

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index fe25c445218db9..751d8c66af927f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -1108,15 +1108,10 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_pow2k_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s8, 0
-; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
-; GISEL-NEXT:    s_mov_b32 s9, s8
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GISEL-NEXT:    s_sub_u32 s10, 0, s6
-; GISEL-NEXT:    s_subb_u32 s11, 0, s7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1125,57 +1120,57 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v8, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s11, v7, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6]
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s10, v8, v[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v8, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s11, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1183,191 +1178,181 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x1000
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v8, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v7, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v9, s7
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v1, v5, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, 0x1000, 0
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v0, v9, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, s6, v7
-; GISEL-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v5, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[1:2]
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v0
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v11
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_mul_f32_e32 v13, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v13, v13
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v13
+; GISEL-NEXT:    v_mov_b32_e32 v10, s6
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v10, v1, s[4:5]
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x1000
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v8, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v12
-; GISEL-NEXT:    s_sub_u32 s7, 0, s8
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, v1, v6, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v14, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v5, v9, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v13, v[1:2]
-; GISEL-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v11
-; GISEL-NEXT:    s_subb_u32 s6, 0, s9
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v14, v[5:6]
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v12, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v10, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2]
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v12, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v16, v15, v6
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v12, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v6, v15, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v13, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v12, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v11, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, v15, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v12, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v13, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v6
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v2, v7
-; GISEL-NEXT:    v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v5
-; GISEL-NEXT:    v_xor_b32_e32 v10, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v10, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v9, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s8, v11, 0
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v12, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s8, v5, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s9, v11, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v10, v3
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_mov_b32_e32 v8, s9
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v6, vcc, s8, v2
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v9
-; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, s8, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[4:5]
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v7
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1752,15 +1737,10 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_srem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s8, 0
-; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
-; GISEL-NEXT:    s_mov_b32 s9, s8
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[6:7], s[4:5], s[8:9]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s6
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, s7
-; GISEL-NEXT:    s_sub_u32 s10, 0, s6
-; GISEL-NEXT:    s_subb_u32 s11, 0, s7
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
@@ -1769,57 +1749,57 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s10, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v8, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s11, v7, v[5:6]
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6]
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
+; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s10, v9, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, 0
 ; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s10, v8, v[4:5]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v8, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s11, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v7, v0, v4
+; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
 ; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, v9, v6
+; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
@@ -1827,191 +1807,181 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v7, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_mul_hi_u32 v6, v7, v1
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v6
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v8, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v7, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; GISEL-NEXT:    v_mov_b32_e32 v9, s7
-; GISEL-NEXT:    v_subb_u32_e64 v8, s[4:5], v11, v5, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v5
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v1, v5, s[4:5]
-; GISEL-NEXT:    s_add_u32 s4, 0x12d8fb, 0
-; GISEL-NEXT:    s_addc_u32 s5, 0, 0
-; GISEL-NEXT:    s_xor_b64 s[8:9], s[4:5], s[8:9]
-; GISEL-NEXT:    v_subb_u32_e32 v5, vcc, v0, v9, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, s8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, s9
-; GISEL-NEXT:    v_subrev_i32_e32 v11, vcc, s6, v7
-; GISEL-NEXT:    v_subbrev_u32_e64 v12, s[4:5], 0, v5, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s7, v12
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[1:2]
+; GISEL-NEXT:    s_bfe_i32 s6, 1, 0x10000
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v0
+; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v11
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_mul_f32_e32 v13, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v13, v13
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v13
+; GISEL-NEXT:    v_mov_b32_e32 v10, s6
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, v10, v1, s[4:5]
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v8, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
+; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s7, v12
-; GISEL-NEXT:    s_sub_u32 s7, 0, s8
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, v1, v6, s[4:5]
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v14, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v5, v9, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v13, v[1:2]
-; GISEL-NEXT:    v_subrev_i32_e32 v1, vcc, s6, v11
-; GISEL-NEXT:    s_subb_u32 s6, 0, s9
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v14, v[5:6]
-; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v12, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; GISEL-NEXT:    v_cndmask_b32_e32 v16, v10, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2]
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v12, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
+; GISEL-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v13, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v14, v6
+; GISEL-NEXT:    v_mul_hi_u32 v16, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v5
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v16, v15, v6
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v12, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
-; GISEL-NEXT:    v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v6, v15, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v13, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v6, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v12, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v11, v[5:6]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, v15, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v12, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
+; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v12, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v13, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_mul_lo_u32 v2, v14, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v6
+; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v8
+; GISEL-NEXT:    v_mul_hi_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v2, v7
-; GISEL-NEXT:    v_mul_lo_u32 v2, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v5
-; GISEL-NEXT:    v_xor_b32_e32 v10, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v3, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v5
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v5
+; GISEL-NEXT:    v_mul_lo_u32 v3, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT:    v_mul_hi_u32 v5, v12, v5
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v12, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v10, v0
-; GISEL-NEXT:    v_mul_lo_u32 v5, v9, v2
-; GISEL-NEXT:    v_mul_hi_u32 v6, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v4
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v14, v2, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v2
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
-; GISEL-NEXT:    v_mul_hi_u32 v5, v9, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v13, v2
+; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s8, v11, 0
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v7, v13, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v12, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s8, v5, v[0:1]
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s9, v11, v[5:6]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v10, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v10, v3
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_mov_b32_e32 v8, s9
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v12, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v4
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v6, vcc, s8, v2
-; GISEL-NEXT:    v_subbrev_u32_e64 v9, s[4:5], 0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s9, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v6
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s9, v9
-; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, s8, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
-; GISEL-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[4:5]
+; GISEL-NEXT:    s_bfe_i32 s4, 1, 0x10000
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v10, s4
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v7
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v7
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v7, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:


        


More information about the llvm-commits mailing list