[llvm] 0d518ae - [GlobalISel] New combine to commute constant operands to the RHS

Jay Foad via llvm-commits <llvm-commits at lists.llvm.org>
Thu Jan 5 03:13:05 PST 2023


Author: Jay Foad
Date: 2023-01-05T11:12:40Z
New Revision: 0d518ae50cbad6dce0397cb5a754b72f3fcf1ced

URL: https://github.com/llvm/llvm-project/commit/0d518ae50cbad6dce0397cb5a754b72f3fcf1ced
DIFF: https://github.com/llvm/llvm-project/commit/0d518ae50cbad6dce0397cb5a754b72f3fcf1ced.diff

LOG: [GlobalISel] New combine to commute constant operands to the RHS

Differential Revision: https://reviews.llvm.org/D140907
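
This canonicalizes the commutative opcodes G_ADD, G_MUL, G_AND, G_OR and G_XOR
so that a constant operand always ends up on the right-hand side, so downstream
combines only need to match the constant-on-RHS form. A minimal illustrative
sketch in MIR (register names invented for illustration, not taken from the
commit):

    Before (constant on the LHS):
        %c:_(s32) = G_CONSTANT i32 5
        %x:_(s32) = COPY $w0
        %add:_(s32) = G_ADD %c, %x

    After commute_constant_to_rhs:
        %add:_(s32) = G_ADD %x, %c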

Added: 
    

Modified: 
    llvm/include/llvm/Target/GlobalISel/Combine.td
    llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir
    llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir
    llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll

Removed: 
    


################################################################################
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 373b8d39d98cc..7aed4982cda3e 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -343,6 +343,24 @@ def select_to_logical : GICombineRule<
   (apply [{ Helper.applyBuildFn(*${root}, ${matchinfo}); }])
 >;
 
+// Fold (C op x) -> (x op C)
+// TODO: handle more isCommutable opcodes
+// TODO: handle compares (currently not marked as isCommutable)
+def commute_constant_to_rhs : GICombineRule<
+  (defs root:$root),
+  (match (wip_match_opcode G_ADD, G_MUL, G_AND, G_OR, G_XOR):$root, [{
+    return getIConstantVRegVal(${root}->getOperand(1).getReg(), MRI).has_value();
+  }]),
+  (apply [{
+    Observer.changingInstr(*${root});
+    Register LHSReg = ${root}->getOperand(1).getReg();
+    Register RHSReg = ${root}->getOperand(2).getReg();
+    ${root}->getOperand(1).setReg(RHSReg);
+    ${root}->getOperand(2).setReg(LHSReg);
+    Observer.changedInstr(*${root});
+  }])
+>;
+
 // Fold x op 0 -> x
 def right_identity_zero: GICombineRule<
   (defs root:$root),
@@ -1086,7 +1104,7 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
     intdiv_combines, mulh_combines, redundant_neg_operands,
     and_or_disjoint_mask, fma_combines, fold_binop_into_select,
     sub_add_reg, select_to_minmax, redundant_binop_in_equality,
-    fsub_to_fneg]>;
+    fsub_to_fneg, commute_constant_to_rhs]>;
 
 // A combine group used for prelegalizer combiners at -O0. The combines in
 // this group have been selected based on experiments to balance code size and
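
Having the constant on the RHS also lets existing RHS-only combines fire. In
the AArch64 combine-shift-immed-mismatch-crash.mir test below, for example, a
G_MUL whose left operand was the constant 16 is commuted and then picked up by
the existing mul_to_shl combine, which only recognizes a power-of-two constant
on the right. A hypothetical sketch of that interaction (register names
invented for illustration):

    %sixteen:_(s32) = G_CONSTANT i32 16
    %mul:_(s32) = G_MUL %sixteen, %x

    which, after commute_constant_to_rhs, mul_to_shl can turn into:

    %four:_(s32) = G_CONSTANT i32 4
    %shl:_(s32) = G_SHL %x, %four(s32)

Likewise, several AMDGPU division tests below drop multiplies by zero,
presumably because the zero is now commuted to the RHS, where the existing
constant folds can eliminate the multiply and the adds that consumed it.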

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir
index c6e5a615f58f5..ac42d2da16d56 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-add-of-sub.mir
@@ -163,7 +163,7 @@ body:             |
     ; CHECK-NEXT: %x1:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: %x2:_(s32) = G_CONSTANT i32 2
     ; CHECK-NEXT: %y:_(s32) = COPY $w1
-    ; CHECK-NEXT: %add:_(s32) = G_ADD %x1, %y
+    ; CHECK-NEXT: %add:_(s32) = G_ADD %y, %x1
     ; CHECK-NEXT: %sub:_(s32) = G_SUB %add, %x2
     ; CHECK-NEXT: $w0 = COPY %sub(s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0
@@ -351,7 +351,7 @@ body:             |
     ; CHECK-NEXT: %x1:_(s32) = G_CONSTANT i32 1
     ; CHECK-NEXT: %x2:_(s32) = G_CONSTANT i32 2
     ; CHECK-NEXT: %y:_(s32) = COPY $w1
-    ; CHECK-NEXT: %add:_(s32) = G_ADD %x1, %y
+    ; CHECK-NEXT: %add:_(s32) = G_ADD %y, %x1
     ; CHECK-NEXT: %sub:_(s32) = G_SUB %x2, %add
     ; CHECK-NEXT: $w0 = COPY %sub(s32)
     ; CHECK-NEXT: RET_ReallyLR implicit $w0

diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
index fca840d80f2d6..96a6f18b1d410 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-immed-mismatch-crash.mir
@@ -13,19 +13,19 @@ body:             |
   ; CHECK:   liveins: $x0
   ; CHECK:   [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
   ; CHECK:   [[DEF1:%[0-9]+]]:_(p0) = G_IMPLICIT_DEF
-  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
-  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
+  ; CHECK:   [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 9
   ; CHECK:   G_BRCOND [[DEF]](s1), %bb.2
   ; CHECK:   G_BR %bb.1
   ; CHECK: bb.1:
   ; CHECK:   successors:
   ; CHECK: bb.2:
   ; CHECK:   [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[DEF1]](p0) :: (load (s32) from `ptr undef`, align 8)
-  ; CHECK:   [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[C]], [[LOAD]]
-  ; CHECK:   [[MUL1:%[0-9]+]]:_(s32) = nsw G_MUL [[MUL]], [[C1]]
+  ; CHECK:   [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 4
+  ; CHECK:   [[SHL:%[0-9]+]]:_(s32) = nsw G_SHL [[LOAD]], [[C1]](s32)
+  ; CHECK:   [[MUL:%[0-9]+]]:_(s32) = nsw G_MUL [[SHL]], [[C]]
   ; CHECK:   [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
-  ; CHECK:   [[SHL:%[0-9]+]]:_(s32) = G_SHL [[MUL1]], [[C2]](s64)
-  ; CHECK:   $w0 = COPY [[SHL]](s32)
+  ; CHECK:   [[SHL1:%[0-9]+]]:_(s32) = G_SHL [[MUL]], [[C2]](s64)
+  ; CHECK:   $w0 = COPY [[SHL1]](s32)
   ; CHECK:   RET_ReallyLR implicit $w0
   bb.1:
     liveins: $x0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
index b633fff9ae508..9f3ad8b444446 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-fold-binop-into-select.mir
@@ -46,7 +46,7 @@ body: |
     ; CHECK-NEXT: %twenty:_(s32) = G_CONSTANT i32 20
     ; CHECK-NEXT: %select:_(s32) = G_SELECT %cond(s1), %ten, %twenty
     ; CHECK-NEXT: %thirty:_(s32) = G_CONSTANT i32 30
-    ; CHECK-NEXT: %add:_(s32) = G_ADD %thirty, %select
+    ; CHECK-NEXT: %add:_(s32) = G_ADD %select, %thirty
     ; CHECK-NEXT: S_ENDPGM 0, implicit %add(s32), implicit %select(s32)
     %reg:_(s32) = COPY $vgpr0
     %zero:_(s32) = G_CONSTANT i32 0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 37b7eacc028ed..78a5bc7c34f23 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -14,7 +14,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x60001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -45,7 +45,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX8-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -76,7 +76,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -107,7 +107,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -141,7 +141,7 @@ define amdgpu_ps i7 @s_fshl_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX11-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
@@ -183,7 +183,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 6
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -213,7 +213,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -243,7 +243,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, 1, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -274,7 +274,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -308,7 +308,7 @@ define i7 @v_fshl_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX11-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX11-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, v3, v4
@@ -1513,7 +1513,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1543,7 +1543,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1573,7 +1573,7 @@ define amdgpu_ps i24 @s_fshl_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    s_bfe_u32 s1, s1, 0x170001
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1673,7 +1673,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1703,7 +1703,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1733,7 +1733,7 @@ define i24 @v_fshl_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_bfe_u32 v1, v1, 1, 23
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1854,7 +1854,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 24
 ; GFX6-NEXT:    s_and_b32 s9, s2, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s9, s2
@@ -1892,7 +1892,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s8, s5, 8
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
@@ -1988,7 +1988,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_bfe_u32 s6, s6, 0x100000
-; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
@@ -2023,7 +2023,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, s10
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
@@ -2099,7 +2099,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_bfe_u32 s0, s0, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX9-NEXT:    s_or_b32 s0, s0, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
 ; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
@@ -2146,7 +2146,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_bfe_u32 s7, s7, 0x100000
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX9-NEXT:    s_or_b32 s4, s4, s7
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
@@ -2485,14 +2485,14 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX6-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v9
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX6-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX6-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2539,14 +2539,14 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX8-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX8-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; GFX8-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v9
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX8-NEXT:    v_mul_lo_u32 v7, v7, v8
+; GFX8-NEXT:    v_mul_lo_u32 v7, v8, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2593,10 +2593,10 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
+; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX9-NEXT:    v_mul_hi_u32 v7, v9, v7

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 501ed2fe4aba1..374455a65bdcf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -15,7 +15,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX6-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -45,7 +45,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX8-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -75,7 +75,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0x7f
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -105,7 +105,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX10-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -138,7 +138,7 @@ define amdgpu_ps i7 @s_fshr_i7(i7 inreg %lhs, i7 inreg %rhs, i7 inreg %amt) {
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_lo_u32 v1, -7, v0
+; GFX11-NEXT:    v_mul_lo_u32 v1, v0, -7
 ; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
@@ -180,7 +180,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX6-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -210,7 +210,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX8-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -240,7 +240,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0x7f, v1
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX9-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -271,7 +271,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GFX10-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GFX10-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX10-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX10-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v4
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -305,7 +305,7 @@ define i7 @v_fshr_i7(i7 %lhs, i7 %rhs, i7 %amt) {
 ; GFX11-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_lo_u32 v4, -7, v3
+; GFX11-NEXT:    v_mul_lo_u32 v4, v3, -7
 ; GFX11-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, v3, v4
@@ -1513,7 +1513,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xffffff
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1544,7 +1544,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s1, s1, 0xffffff
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1575,7 +1575,7 @@ define amdgpu_ps i24 @s_fshr_i24(i24 inreg %lhs, i24 inreg %rhs, i24 inreg %amt)
 ; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s2, v0
@@ -1678,7 +1678,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX6-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX6-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX6-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1709,7 +1709,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX8-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX8-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX8-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1740,7 +1740,7 @@ define i24 @v_fshr_i24(i24 %lhs, i24 %rhs, i24 %amt) {
 ; GFX9-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX9-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GFX9-NEXT:    v_add_u32_e32 v3, v3, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v2, v3
@@ -1854,7 +1854,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX6-NEXT:    s_and_b32 s11, s2, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s2, s2, 0x80008
-; GFX6-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
 ; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s11, s2
@@ -1892,7 +1892,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s10, s5, 8
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s4, v0
 ; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v0
@@ -1989,7 +1989,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_bfe_u32 s8, s8, 0x100000
-; GFX8-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX8-NEXT:    s_lshr_b32 s12, s3, 8
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x100000
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
@@ -2024,7 +2024,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s12, s5, 8
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, s10
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s4, v0
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v0
@@ -2099,7 +2099,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
 ; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
-; GFX9-NEXT:    v_mul_lo_u32 v2, v1, v0
+; GFX9-NEXT:    v_mul_lo_u32 v2, v0, v1
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, s12
 ; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX9-NEXT:    s_or_b32 s1, s10, s1
@@ -2145,7 +2145,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s4, s4, s10
 ; GFX9-NEXT:    s_and_b32 s10, s11, 0xff
 ; GFX9-NEXT:    s_bfe_u32 s10, s10, 0x100000
-; GFX9-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX9-NEXT:    v_mul_lo_u32 v1, v2, v1
 ; GFX9-NEXT:    s_bfe_u32 s4, s4, 0x100000
 ; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX9-NEXT:    s_or_b32 s4, s4, s10
@@ -2496,7 +2496,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
@@ -2513,7 +2513,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX6-NEXT:    v_mul_lo_u32 v6, v7, v8
+; GFX6-NEXT:    v_mul_lo_u32 v6, v8, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX6-NEXT:    v_mul_hi_u32 v6, v8, v6
@@ -2552,7 +2552,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
@@ -2569,7 +2569,7 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
-; GFX8-NEXT:    v_mul_lo_u32 v6, v7, v8
+; GFX8-NEXT:    v_mul_lo_u32 v6, v8, v7
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v6, v8, v6
@@ -2608,9 +2608,9 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    v_mul_f32_e32 v9, 0x4f7ffffe, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v9, v9
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT:    v_mul_lo_u32 v8, v7, v6
+; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v7
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX9-NEXT:    v_mul_lo_u32 v7, v7, v9
+; GFX9-NEXT:    v_mul_lo_u32 v7, v9, v7
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index a2aa740410ab9..6eed92ba1d71c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -678,7 +678,7 @@ define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr
 ; GFX6-NEXT:    s_load_dword s0, s[6:7], 0x0
 ; GFX6-NEXT:    s_mov_b32 s6, -1
 ; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    v_mul_lo_u32 v1, -2, v0
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, -2
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x100001
 ; GFX6-NEXT:    s_ashr_i32 s2, s0, 31

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index 3d63978c5f3a4..cbc0c545439c8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -112,9 +112,8 @@ define amdgpu_kernel void @v_mul_i64_zext_10(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v3, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, 0, v0
-; GFX10-NEXT:    v_add3_u32 v3, v3, v1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v4, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -131,11 +130,9 @@ define amdgpu_kernel void @v_mul_i64_zext_10(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX11-NEXT:    v_mul_lo_u32 v0, 0, v0
+; GFX11-NEXT:    v_mul_lo_u32 v0, v4, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v1, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -164,8 +161,6 @@ define amdgpu_kernel void @v_mul_i64_zext_11(ptr addrspace(1) %out, ptr addrspac
 ; GFX10-NEXT:    global_load_dword v2, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, v1, v2, 0
-; GFX10-NEXT:    v_mul_lo_u32 v2, 0, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -176,15 +171,13 @@ define amdgpu_kernel void @v_mul_i64_zext_11(ptr addrspace(1) %out, ptr addrspac
 ; GFX11-NEXT:    s_load_b128 s[4:7], s[0:1], 0x24
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x34
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_clause 0x1
 ; GFX11-NEXT:    global_load_b32 v1, v0, s[6:7]
-; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-NEXT:    global_load_b32 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v2, 0
-; GFX11-NEXT:    v_mul_lo_u32 v2, 0, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_add_nc_u32 v1, v1, v2
+; GFX11-NEXT:    v_mad_u64_u32 v[0:1], null, v1, v0, 0
 ; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
@@ -214,9 +207,8 @@ define amdgpu_kernel void @v_mul_i64_and_a_hi(ptr addrspace(1) %out, ptr addrspa
 ; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX10-NEXT:    v_mul_lo_u32 v0, 0, v0
-; GFX10-NEXT:    v_add3_u32 v3, v3, v1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v0, v4, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v3, v0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    global_store_dwordx2 v0, v[2:3], s[4:5]
 ; GFX10-NEXT:    s_endpgm
@@ -233,11 +225,9 @@ define amdgpu_kernel void @v_mul_i64_and_a_hi(ptr addrspace(1) %out, ptr addrspa
 ; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[0:1]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT:    v_mul_lo_u32 v1, v4, v1
-; GFX11-NEXT:    v_mul_lo_u32 v0, 0, v0
+; GFX11-NEXT:    v_mul_lo_u32 v0, v4, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_add3_u32 v3, v3, v1, v0
-; GFX11-NEXT:    v_mov_b32_e32 v0, 0
+; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
 ; GFX11-NEXT:    global_store_b64 v0, v[2:3], s[4:5]
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir
index f97c238427b91..600cefbcc161a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/postlegalizer-combiner-trunc-bitcast-buildvector.mir
@@ -13,7 +13,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]]
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
     ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -48,7 +48,7 @@ body: |
     ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
     ; CHECK-NEXT: [[TRUNC2:%[0-9]+]]:_(s16) = G_TRUNC [[LSHR]](s32)
     ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 42
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C1]], [[TRUNC2]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC2]], [[C1]]
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
     ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -78,7 +78,7 @@ body: |
     ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr1
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]]
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
     ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
     %0:_(s32) = COPY $vgpr0
@@ -111,7 +111,7 @@ body: |
     ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s64) = G_BITCAST [[BUILD_VECTOR]](<2 x s32>)
     ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[BITCAST]](s64)
     ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 42
-    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[C]], [[TRUNC]]
+    ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s16) = G_OR [[TRUNC]], [[C]]
     ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[OR]](s16)
     ; CHECK-NEXT: $vgpr0 = COPY [[ZEXT]](s32)
     %0:_(s32) = COPY $vgpr0

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 78a0048443dc3..c52e3e21aa98b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -54,13 +54,9 @@ define i32 @v_sdiv_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v5, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
@@ -130,13 +126,9 @@ define amdgpu_ps i32 @s_sdiv_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, s1, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, 0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v1, 0, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v0, s2
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
@@ -247,20 +239,12 @@ define <2 x i32> @v_sdiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v10, v5, v3
@@ -302,7 +286,7 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -328,7 +312,7 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s8, 0x1000, 0
+; GISEL-NEXT:    s_add_i32 s8, 0, 0x1000
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
@@ -394,8 +378,8 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, s4, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v7
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, s4
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
@@ -443,7 +427,7 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -469,7 +453,7 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s8, 0x12d8fb, 0
+; GISEL-NEXT:    s_add_i32 s8, 0, 0x12d8fb
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s8
@@ -535,8 +519,8 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, s4, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v7
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, s4
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
@@ -706,20 +690,12 @@ define <2 x i32> @v_sdiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v9, v7
-; CGP-NEXT:    v_mul_lo_u32 v10, 0, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v5, v6
-; CGP-NEXT:    v_mul_lo_u32 v11, 0, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v1, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
 ; CGP-NEXT:    v_mul_lo_u32 v7, v5, v2
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, 1, v5
 ; CGP-NEXT:    v_mul_lo_u32 v10, v6, v3
@@ -797,13 +773,9 @@ define i32 @v_sdiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
@@ -906,20 +878,12 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index b902dc471e37a..728e13788f9bc 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -50,13 +50,9 @@ define i32 @v_srem_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v3
-; CGP-NEXT:    v_mul_lo_u32 v5, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v1
@@ -120,13 +116,9 @@ define amdgpu_ps i32 @s_srem_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, s3, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, 0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v1, 0, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_mul_lo_u32 v0, v0, s1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; CGP-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v0
@@ -227,20 +219,12 @@ define <2 x i32> @v_srem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v5, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, 0, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v9
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v0, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v2
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
@@ -278,7 +262,7 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -302,7 +286,7 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s4, 0x1000, 0
+; GISEL-NEXT:    s_add_i32 s4, 0, 0x1000
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s4
@@ -364,8 +348,8 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, s5, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v7
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, s5
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
@@ -409,7 +393,7 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT:    v_mul_lo_u32 v3, v3, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
@@ -433,7 +417,7 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_add_i32 s4, 0x12d8fb, 0
+; GISEL-NEXT:    s_add_i32 s4, 0, 0x12d8fb
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, s4
@@ -495,8 +479,8 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, s5, v3
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, v7
+; CGP-NEXT:    v_mul_lo_u32 v8, v3, s5
+; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
 ; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
 ; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
@@ -651,20 +635,12 @@ define <2 x i32> @v_srem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
 ; CGP-NEXT:    v_mul_lo_u32 v9, v9, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v11, 0, v9
 ; CGP-NEXT:    v_mul_hi_u32 v9, v8, v9
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
 ; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v9
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v0, v6
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_mul_lo_u32 v6, v6, v2
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
@@ -735,13 +711,9 @@ define i32 @v_srem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
@@ -836,20 +808,12 @@ define <2 x i32> @v_srem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index 5721a4c9483d5..926c3d59e2e46 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -38,13 +38,9 @@ define i32 @v_udiv_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
@@ -96,13 +92,9 @@ define amdgpu_ps i32 @s_udiv_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, s2, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, 0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v1, 0, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, v0, s1
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, 1, v0
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
@@ -179,20 +171,12 @@ define <2 x i32> @v_udiv_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
@@ -367,20 +351,12 @@ define <2 x i32> @v_udiv_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3
@@ -445,13 +421,9 @@ define i32 @v_udiv_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
@@ -536,20 +508,12 @@ define <2 x i32> @v_udiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v6, v4, v2
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_mul_lo_u32 v8, v5, v3

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 004c6d7ab9ada..48f05a33f0364 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -36,13 +36,9 @@ define i32 @v_urem_i32(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
@@ -90,13 +86,9 @@ define amdgpu_ps i32 @s_urem_i32(i32 inreg %num, i32 inreg %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; CGP-NEXT:    v_mul_lo_u32 v1, s2, v0
-; CGP-NEXT:    v_mul_lo_u32 v2, 0, v1
 ; CGP-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_mul_lo_u32 v1, 0, v0
 ; CGP-NEXT:    v_mul_hi_u32 v0, s0, v0
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_mul_lo_u32 v0, v0, s1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, s0, v0
 ; CGP-NEXT:    v_subrev_i32_e32 v1, vcc, s1, v0
@@ -167,20 +159,12 @@ define <2 x i32> @v_urem_v2i32(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
@@ -232,7 +216,7 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v1
+; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
@@ -260,7 +244,7 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v3
@@ -291,7 +275,7 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-NEXT:    s_mov_b32 s5, 0xffed2705
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, s5, v2
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, s5
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v0, v2
@@ -404,20 +388,12 @@ define <2 x i32> @v_urem_v2i32_pow2_shl_denom(<2 x i32> %x, <2 x i32> %y) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
@@ -476,13 +452,9 @@ define i32 @v_urem_i32_24bit(i32 %num, i32 %den) {
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; CGP-NEXT:    v_mul_lo_u32 v3, v3, v2
-; CGP-NEXT:    v_mul_lo_u32 v4, 0, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT:    v_mul_lo_u32 v3, 0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; CGP-NEXT:    v_mul_lo_u32 v2, v2, v1
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v1
@@ -561,20 +533,12 @@ define <2 x i32> @v_urem_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v4
 ; CGP-NEXT:    v_mul_lo_u32 v7, v7, v6
-; CGP-NEXT:    v_mul_lo_u32 v8, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, 0, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
-; CGP-NEXT:    v_mul_lo_u32 v6, 0, v4
 ; CGP-NEXT:    v_mul_hi_u32 v4, v0, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, 0, v5
 ; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
 ; CGP-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; CGP-NEXT:    v_mul_lo_u32 v5, v5, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 2178cf15794f4..261482c97e3e3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -985,58 +985,56 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v8, s5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, -1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, s5, v3
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v6, s5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, s5
+; CHECK-NEXT:    v_mul_hi_u32 v9, s5, v3
+; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v8
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v8
+; CHECK-NEXT:    v_mul_hi_u32 v10, v3, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v11, v3, v7
 ; CHECK-NEXT:    v_mul_lo_u32 v12, v6, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
 ; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v7, s5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v8, -1, v3
-; CHECK-NEXT:    v_mul_hi_u32 v9, s5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v10, s5, v6
-; CHECK-NEXT:    v_mul_lo_u32 v11, v6, v7
-; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v7
+; CHECK-NEXT:    v_mul_lo_u32 v7, v3, s5
+; CHECK-NEXT:    v_mul_hi_u32 v8, s5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v9, v6, s5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v7
+; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v7
 ; CHECK-NEXT:    v_mul_hi_u32 v7, v6, v7
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CHECK-NEXT:    v_sub_i32_e32 v9, vcc, v9, v3
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v8
-; CHECK-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v12, v6, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v13, v3, v8
 ; CHECK-NEXT:    v_mul_hi_u32 v8, v6, v8
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
@@ -1063,12 +1061,10 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v8, s4, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, 0, v3
+; CHECK-NEXT:    v_mul_lo_u32 v8, v3, s4
 ; CHECK-NEXT:    v_mul_hi_u32 v3, s4, v3
 ; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_mul_lo_u32 v6, s4, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
+; CHECK-NEXT:    v_mul_lo_u32 v6, v6, s4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; CHECK-NEXT:    v_subb_u32_e64 v6, s[4:5], v1, v3, vcc
@@ -1288,19 +1284,15 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; GISEL-NEXT:    v_mul_lo_u32 v14, s8, v8
-; GISEL-NEXT:    v_mul_lo_u32 v15, 0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v14, v8, s8
 ; GISEL-NEXT:    v_mul_hi_u32 v8, s8, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_mul_lo_u32 v12, s8, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, 0, v7
+; GISEL-NEXT:    v_mul_lo_u32 v12, v7, s8
 ; GISEL-NEXT:    v_mul_hi_u32 v7, s8, v7
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; GISEL-NEXT:    v_mul_lo_u32 v9, s8, v9
-; GISEL-NEXT:    v_mul_lo_u32 v10, s8, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v15, v9
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT:    v_mul_lo_u32 v9, v9, s8
+; GISEL-NEXT:    v_mul_lo_u32 v10, v10, s8
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
@@ -1380,121 +1372,117 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v8
+; CGP-NEXT:    v_mul_lo_u32 v11, v8, s6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v10
-; CGP-NEXT:    v_mul_lo_u32 v13, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v14, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v15, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v16, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v17, -1, v7
-; CGP-NEXT:    v_mul_hi_u32 v18, s6, v7
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v10, s6
+; CGP-NEXT:    v_mul_lo_u32 v13, v5, s6
+; CGP-NEXT:    v_mul_hi_u32 v14, s6, v5
+; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v11, v5
+; CGP-NEXT:    v_mul_lo_u32 v15, v7, s6
+; CGP-NEXT:    v_mul_hi_u32 v16, s6, v7
+; CGP-NEXT:    v_sub_i32_e32 v12, vcc, v12, v7
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; CGP-NEXT:    v_mul_lo_u32 v14, v8, v13
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v5, v13
 ; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v16
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; CGP-NEXT:    v_mul_hi_u32 v15, v7, v16
-; CGP-NEXT:    v_mul_hi_u32 v16, v10, v16
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
-; CGP-NEXT:    v_mul_lo_u32 v18, v7, v12
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
-; CGP-NEXT:    v_mul_lo_u32 v15, v5, v11
-; CGP-NEXT:    v_mul_lo_u32 v17, v8, v11
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_mul_lo_u32 v16, v10, v15
+; CGP-NEXT:    v_mul_hi_u32 v18, v7, v15
+; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
+; CGP-NEXT:    v_mul_lo_u32 v19, v7, v12
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; CGP-NEXT:    v_mul_lo_u32 v16, v5, v11
+; CGP-NEXT:    v_mul_lo_u32 v18, v8, v11
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v17
 ; CGP-NEXT:    v_mul_hi_u32 v14, v5, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v19
-; CGP-NEXT:    v_mul_lo_u32 v19, v10, v12
-; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v17, v13
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v17
+; CGP-NEXT:    v_mul_lo_u32 v17, v10, v12
+; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v18, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e64 v13, s[4:5], v13, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v17, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v18, v17
-; CGP-NEXT:    v_mul_hi_u32 v18, v7, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v19, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v18
+; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v16, v17
+; CGP-NEXT:    v_mul_hi_u32 v19, v7, v12
+; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v17, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v18, v17
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v18
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v13
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v11, s6, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, -1, v5
-; CGP-NEXT:    v_mul_hi_u32 v14, s6, v5
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, s6
+; CGP-NEXT:    v_mul_hi_u32 v13, s6, v5
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v15
 ; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v10, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v12, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v15, -1, v7
-; CGP-NEXT:    v_mul_hi_u32 v16, s6, v7
-; CGP-NEXT:    v_mul_lo_u32 v17, s6, v8
-; CGP-NEXT:    v_mul_lo_u32 v18, v8, v11
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v11
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, s6
+; CGP-NEXT:    v_mul_hi_u32 v14, s6, v7
+; CGP-NEXT:    v_mul_lo_u32 v15, v8, s6
+; CGP-NEXT:    v_mul_lo_u32 v16, v8, v11
+; CGP-NEXT:    v_mul_hi_u32 v17, v5, v11
 ; CGP-NEXT:    v_mul_hi_u32 v11, v8, v11
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
-; CGP-NEXT:    v_mul_lo_u32 v17, s6, v10
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
-; CGP-NEXT:    v_mul_lo_u32 v17, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_mul_hi_u32 v14, v7, v12
+; CGP-NEXT:    v_mul_lo_u32 v18, v10, s6
+; CGP-NEXT:    v_mul_lo_u32 v19, v10, v12
+; CGP-NEXT:    v_sub_i32_e32 v15, vcc, v15, v5
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; CGP-NEXT:    v_mul_hi_u32 v15, v7, v12
 ; CGP-NEXT:    v_mul_hi_u32 v12, v10, v12
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v15
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v5, v13
-; CGP-NEXT:    v_mul_lo_u32 v16, v8, v13
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v18, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v14, v19
-; CGP-NEXT:    v_mul_hi_u32 v14, v5, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v18, s[4:5], v18, v19
-; CGP-NEXT:    v_mul_lo_u32 v19, v10, v15
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v16, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[4:5]
-; CGP-NEXT:    v_add_i32_e64 v14, s[4:5], v16, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_mul_hi_u32 v17, v7, v15
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; CGP-NEXT:    v_sub_i32_e32 v18, vcc, v18, v7
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v18, v14
+; CGP-NEXT:    v_mul_lo_u32 v18, v7, v14
+; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v18, v15
+; CGP-NEXT:    v_mul_lo_u32 v15, v5, v13
+; CGP-NEXT:    v_mul_lo_u32 v18, v8, v13
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v16, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v15, v17
+; CGP-NEXT:    v_mul_hi_u32 v15, v5, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v16, s[4:5], v16, v17
+; CGP-NEXT:    v_mul_lo_u32 v17, v10, v14
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v18, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e64 v15, s[4:5], v18, v15
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
+; CGP-NEXT:    v_mul_hi_u32 v19, v7, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v19, v17
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v19
 ; CGP-NEXT:    v_mov_b32_e32 v19, s7
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
-; CGP-NEXT:    v_mov_b32_e32 v18, s9
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_mul_hi_u32 v15, v10, v15
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v17, v16
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v16
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
+; CGP-NEXT:    v_mov_b32_e32 v16, s9
+; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v10, v14
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
 ; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v13, vcc
 ; CGP-NEXT:    v_mul_lo_u32 v11, v1, v5
@@ -1506,8 +1494,8 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_mul_hi_u32 v14, v2, v7
 ; CGP-NEXT:    v_mul_hi_u32 v7, v3, v7
 ; CGP-NEXT:    v_mul_lo_u32 v15, v0, v8
-; CGP-NEXT:    v_mul_lo_u32 v16, v1, v8
-; CGP-NEXT:    v_mul_hi_u32 v17, v0, v8
+; CGP-NEXT:    v_mul_lo_u32 v17, v1, v8
+; CGP-NEXT:    v_mul_hi_u32 v18, v0, v8
 ; CGP-NEXT:    v_mul_hi_u32 v8, v1, v8
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
@@ -1519,38 +1507,34 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_add_i32_e64 v11, s[4:5], v11, v14
 ; CGP-NEXT:    v_mul_hi_u32 v11, v2, v10
 ; CGP-NEXT:    v_mul_hi_u32 v10, v3, v10
-; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v16, v5
+; CGP-NEXT:    v_add_i32_e64 v5, s[6:7], v17, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, s[6:7]
 ; CGP-NEXT:    v_add_i32_e64 v7, s[6:7], v13, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s[6:7]
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[4:5]
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, v15, v17
+; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[4:5]
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v16
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v18
+; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_mul_lo_u32 v14, s8, v5
-; CGP-NEXT:    v_mul_lo_u32 v15, 0, v5
+; CGP-NEXT:    v_mul_lo_u32 v14, v5, s8
 ; CGP-NEXT:    v_mul_hi_u32 v5, s8, v5
 ; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, s8, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, 0, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v7, s8
 ; CGP-NEXT:    v_mul_hi_u32 v7, s8, v7
 ; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
-; CGP-NEXT:    v_mul_lo_u32 v8, s8, v8
-; CGP-NEXT:    v_mul_lo_u32 v10, s8, v10
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
+; CGP-NEXT:    v_mul_lo_u32 v8, v8, s8
+; CGP-NEXT:    v_mul_lo_u32 v10, v10, s8
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
@@ -1582,7 +1566,7 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v7, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v13, v16, v13, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v12, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v15, vcc, 0, v3, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
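
A note on the test churn above: the deleted `v_mul_lo_u32 ..., 0, ...` instructions and their companion `v_add_i32_e32` adds are what you would expect to disappear once a constant left operand has been moved to the right, where the existing "x op 0" folds can match it. A minimal IR sketch of that pattern (the function and value names here are illustrative, not taken from the tests):

    define i32 @example(i32 %x, i32 %y) {
      ; constant on the LHS: folds written against "x op 0" do not match
      %lo = mul i32 0, %x
      %sum = add i32 %lo, %y
      ret i32 %sum
    }

After the commute, `%lo` becomes `mul i32 %x, 0`, which can fold to zero, and the add then folds away as x + 0, mirroring the mul/add instruction pairs removed throughout the udiv/urem tests.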