[llvm] 8abd700 - [TargetLowering] Teach BuildUDIV to take advantage of leading zeros in the dividend.

Craig Topper via llvm-commits llvm-commits at lists.llvm.org
Thu Dec 29 13:59:13 PST 2022


Author: Craig Topper
Date: 2022-12-29T13:58:46-08:00
New Revision: 8abd70081f761738e82b37b2891b60ad034f3880

URL: https://github.com/llvm/llvm-project/commit/8abd70081f761738e82b37b2891b60ad034f3880
DIFF: https://github.com/llvm/llvm-project/commit/8abd70081f761738e82b37b2891b60ad034f3880.diff

LOG: [TargetLowering] Teach BuildUDIV to take advantage of leading zeros in the dividend.

If the dividend is known to have leading zero bits, we can use them to
reduce the size of the multiplier and avoid the expensive fixup cases.

This patch handles scalars only, but we might be able to do the same
for vectors in a follow-up.
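
As an illustration (not part of the patch), here is a minimal standalone
sketch of the idea for a dividend known to fit in 16 bits and a divisor of
95, matching the 689/55879 multiplier halves that appear in the updated
AArch64 and PowerPC checks below: with 16 known leading zero bits the
multiplier ceil(2^32 / 95) is exact for every such dividend, so no
pre-shift, post-shift, or add/sub fixup is needed.

// Standalone illustration only (not the patch itself); the divisor and the
// expected constant are taken from the divide-by-95 cases in the updated
// tests.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t Divisor = 95;
  // ceil(2^32 / 95) == 45210183 == 0x02B1DA47, i.e. (689 << 16) | 55879,
  // the constant materialized in the new AArch64/PowerPC sequences.
  const uint64_t Magic = (UINT64_C(1) << 32) / Divisor + 1;
  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
    // A single "multiply high" recovers the quotient; no fixup is required
    // because the dividend has at least 16 leading zeros.
    uint32_t Q = (uint32_t)((X * Magic) >> 32);
    assert(Q == X / Divisor);
  }
  printf("magic 0x%08llX verified for all 16-bit dividends\n",
         (unsigned long long)Magic);
  return 0;
}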

Differential Revision: https://reviews.llvm.org/D140750

Added: 
    

Modified: 
    llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
    llvm/test/CodeGen/AArch64/funnel-shift.ll
    llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
    llvm/test/CodeGen/AMDGPU/fshr.ll
    llvm/test/CodeGen/ARM/select-imm.ll
    llvm/test/CodeGen/PowerPC/funnel-shift.ll
    llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
    llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
    llvm/test/CodeGen/X86/divide-by-constant.ll
    llvm/test/CodeGen/X86/funnel-shift.ll
    llvm/test/CodeGen/X86/pr38217.ll
    llvm/unittests/Support/DivisionByConstantTest.cpp

Removed: 
    


################################################################################
diff  --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dbf318a85e9ed..ad7ea4f79ed22 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -6016,6 +6016,23 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
       return SDValue();
   }
 
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // Try to use leading zeros of the dividend to reduce the multiplier and
+  // avoid expensive fixups.
+  // TODO: Support vectors.
+  unsigned LeadingZeros = 0;
+  if (!VT.isVector() && isa<ConstantSDNode>(N1)) {
+    assert(!isOneConstant(N1) && "Unexpected divisor");
+    LeadingZeros = DAG.computeKnownBits(N0).countMinLeadingZeros();
+    // UnsignedDivisionByConstantInfo doesn't work correctly if leading zeros in
+    // the dividend exceed the leading zeros for the divisor.
+    LeadingZeros =
+        std::min(LeadingZeros,
+                 cast<ConstantSDNode>(N1)->getAPIntValue().countLeadingZeros());
+  }
+
   bool UseNPQ = false;
   SmallVector<SDValue, 16> PreShifts, PostShifts, MagicFactors, NPQFactors;
 
@@ -6026,7 +6043,7 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     // bits are known to be zero.
     const APInt& Divisor = C->getAPIntValue();
     UnsignedDivisionByConstantInfo magics =
-        UnsignedDivisionByConstantInfo::get(Divisor);
+        UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
     unsigned PreShift = 0, PostShift = 0;
 
     // If the divisor is even, we can avoid using the expensive fixup by
@@ -6034,8 +6051,8 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     if (magics.IsAdd && !Divisor[0]) {
       PreShift = Divisor.countTrailingZeros();
       // Get magic number for the shifted divisor.
-      magics =
-          UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift), PreShift);
+      magics = UnsignedDivisionByConstantInfo::get(Divisor.lshr(PreShift),
+                                                   PreShift + LeadingZeros);
       assert(!magics.IsAdd && "Should use cheap fixup now");
     }
 
@@ -6061,9 +6078,6 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
     return true;
   };
 
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
   // Collect the shifts/magic values from each element.
   if (!ISD::matchUnaryPredicate(N1, BuildUDIVPattern))
     return SDValue();

diff  --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index 6dfc61046c5e8..a7f3aed163910 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -69,15 +69,14 @@ declare i37 @llvm.fshl.i37(i37, i37, i37)
 define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
 ; CHECK-LABEL: fshl_i37:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #31883
+; CHECK-NEXT:    mov x9, #46053
 ; CHECK-NEXT:    and x8, x2, #0x1fffffffff
-; CHECK-NEXT:    movk x9, #3542, lsl #16
+; CHECK-NEXT:    movk x9, #12398, lsl #16
 ; CHECK-NEXT:    ubfiz x10, x1, #26, #37
-; CHECK-NEXT:    movk x9, #51366, lsl #32
-; CHECK-NEXT:    movk x9, #56679, lsl #48
+; CHECK-NEXT:    movk x9, #15941, lsl #32
+; CHECK-NEXT:    movk x9, #1771, lsl #48
 ; CHECK-NEXT:    umulh x8, x8, x9
 ; CHECK-NEXT:    mov w9, #37
-; CHECK-NEXT:    ubfx x8, x8, #5, #27
 ; CHECK-NEXT:    msub w8, w8, w9, w2
 ; CHECK-NEXT:    mvn w9, w8
 ; CHECK-NEXT:    lsl x8, x0, x8
@@ -207,16 +206,15 @@ declare i37 @llvm.fshr.i37(i37, i37, i37)
 define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
 ; CHECK-LABEL: fshr_i37:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov x9, #31883
+; CHECK-NEXT:    mov x9, #46053
 ; CHECK-NEXT:    and x8, x2, #0x1fffffffff
-; CHECK-NEXT:    movk x9, #3542, lsl #16
+; CHECK-NEXT:    movk x9, #12398, lsl #16
 ; CHECK-NEXT:    lsl x10, x1, #27
-; CHECK-NEXT:    movk x9, #51366, lsl #32
+; CHECK-NEXT:    movk x9, #15941, lsl #32
 ; CHECK-NEXT:    lsl x11, x0, #1
-; CHECK-NEXT:    movk x9, #56679, lsl #48
+; CHECK-NEXT:    movk x9, #1771, lsl #48
 ; CHECK-NEXT:    umulh x8, x8, x9
 ; CHECK-NEXT:    mov w9, #37
-; CHECK-NEXT:    lsr x8, x8, #5
 ; CHECK-NEXT:    msub w8, w8, w9, w2
 ; CHECK-NEXT:    add w8, w8, #27
 ; CHECK-NEXT:    mvn w9, w8

diff  --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
index 1f95ad911f5d2..1763a983e526c 100644
--- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll
@@ -6,40 +6,36 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
 ; CHECK-NEXT:    umov w8, v0.h[0]
-; CHECK-NEXT:    mov w9, #8969
-; CHECK-NEXT:    movk w9, #22765, lsl #16
+; CHECK-NEXT:    mov w9, #55879
+; CHECK-NEXT:    movk w9, #689, lsl #16
 ; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    mov w12, #16913
-; CHECK-NEXT:    mov w13, #95
-; CHECK-NEXT:    movk w12, #8456, lsl #16
+; CHECK-NEXT:    mov w11, #33826
+; CHECK-NEXT:    mov w12, #95
+; CHECK-NEXT:    movk w11, #528, lsl #16
+; CHECK-NEXT:    umov w13, v0.h[2]
 ; CHECK-NEXT:    umull x9, w8, w9
-; CHECK-NEXT:    ubfx w14, w10, #2, #14
+; CHECK-NEXT:    umull x11, w10, w11
 ; CHECK-NEXT:    lsr x9, x9, #32
-; CHECK-NEXT:    sub w11, w8, w9
-; CHECK-NEXT:    umull x12, w14, w12
-; CHECK-NEXT:    add w9, w9, w11, lsr #1
-; CHECK-NEXT:    umov w11, v0.h[2]
-; CHECK-NEXT:    lsr w9, w9, #6
-; CHECK-NEXT:    lsr x12, x12, #34
-; CHECK-NEXT:    msub w8, w9, w13, w8
-; CHECK-NEXT:    mov w9, #33437
-; CHECK-NEXT:    movk w9, #21399, lsl #16
-; CHECK-NEXT:    mov w13, #124
-; CHECK-NEXT:    umull x9, w11, w9
-; CHECK-NEXT:    msub w10, w12, w13, w10
-; CHECK-NEXT:    umov w12, v0.h[3]
+; CHECK-NEXT:    lsr x11, x11, #32
+; CHECK-NEXT:    msub w8, w9, w12, w8
+; CHECK-NEXT:    mov w9, #48149
+; CHECK-NEXT:    movk w9, #668, lsl #16
+; CHECK-NEXT:    mov w12, #124
+; CHECK-NEXT:    umull x9, w13, w9
+; CHECK-NEXT:    msub w10, w11, w12, w10
+; CHECK-NEXT:    umov w11, v0.h[3]
 ; CHECK-NEXT:    fmov s0, w8
-; CHECK-NEXT:    mov w13, #2287
-; CHECK-NEXT:    lsr x8, x9, #37
+; CHECK-NEXT:    mov w12, #22281
+; CHECK-NEXT:    lsr x8, x9, #32
 ; CHECK-NEXT:    mov w9, #98
-; CHECK-NEXT:    movk w13, #16727, lsl #16
-; CHECK-NEXT:    msub w8, w8, w9, w11
+; CHECK-NEXT:    movk w12, #65, lsl #16
+; CHECK-NEXT:    msub w8, w8, w9, w13
 ; CHECK-NEXT:    mov v0.h[1], w10
-; CHECK-NEXT:    umull x9, w12, w13
+; CHECK-NEXT:    umull x9, w11, w12
 ; CHECK-NEXT:    mov w10, #1003
-; CHECK-NEXT:    lsr x9, x9, #40
+; CHECK-NEXT:    lsr x9, x9, #32
 ; CHECK-NEXT:    mov v0.h[2], w8
-; CHECK-NEXT:    msub w8, w9, w10, w12
+; CHECK-NEXT:    msub w8, w9, w10, w11
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -51,40 +47,28 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; CHECK-LABEL: fold_urem_vec_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w10, v0.h[0]
-; CHECK-NEXT:    mov w8, #8969
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    movk w8, #22765, lsl #16
-; CHECK-NEXT:    umov w15, v0.h[2]
-; CHECK-NEXT:    umov w16, v0.h[3]
-; CHECK-NEXT:    umull x12, w10, w8
-; CHECK-NEXT:    umull x11, w9, w8
-; CHECK-NEXT:    lsr x12, x12, #32
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    mov w9, #55879
+; CHECK-NEXT:    movk w9, #689, lsl #16
+; CHECK-NEXT:    umov w10, v0.h[1]
+; CHECK-NEXT:    mov w12, #95
+; CHECK-NEXT:    umov w13, v0.h[2]
+; CHECK-NEXT:    umull x11, w8, w9
+; CHECK-NEXT:    umull x14, w10, w9
 ; CHECK-NEXT:    lsr x11, x11, #32
-; CHECK-NEXT:    sub w14, w10, w12
-; CHECK-NEXT:    sub w13, w9, w11
-; CHECK-NEXT:    add w12, w12, w14, lsr #1
-; CHECK-NEXT:    umull x14, w15, w8
-; CHECK-NEXT:    add w11, w11, w13, lsr #1
-; CHECK-NEXT:    mov w13, #95
-; CHECK-NEXT:    lsr w12, w12, #6
-; CHECK-NEXT:    lsr w11, w11, #6
-; CHECK-NEXT:    umull x8, w16, w8
-; CHECK-NEXT:    msub w10, w12, w13, w10
-; CHECK-NEXT:    lsr x12, x14, #32
-; CHECK-NEXT:    msub w9, w11, w13, w9
-; CHECK-NEXT:    sub w11, w15, w12
-; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    fmov s0, w10
-; CHECK-NEXT:    add w10, w12, w11, lsr #1
-; CHECK-NEXT:    lsr w10, w10, #6
-; CHECK-NEXT:    sub w11, w16, w8
-; CHECK-NEXT:    mov v0.h[1], w9
-; CHECK-NEXT:    msub w9, w10, w13, w15
-; CHECK-NEXT:    add w8, w8, w11, lsr #1
-; CHECK-NEXT:    lsr w8, w8, #6
-; CHECK-NEXT:    mov v0.h[2], w9
-; CHECK-NEXT:    msub w8, w8, w13, w16
+; CHECK-NEXT:    msub w8, w11, w12, w8
+; CHECK-NEXT:    lsr x11, x14, #32
+; CHECK-NEXT:    umull x14, w13, w9
+; CHECK-NEXT:    msub w10, w11, w12, w10
+; CHECK-NEXT:    umov w11, v0.h[3]
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    lsr x8, x14, #32
+; CHECK-NEXT:    msub w8, w8, w12, w13
+; CHECK-NEXT:    mov v0.h[1], w10
+; CHECK-NEXT:    umull x9, w11, w9
+; CHECK-NEXT:    lsr x9, x9, #32
+; CHECK-NEXT:    mov v0.h[2], w8
+; CHECK-NEXT:    msub w8, w9, w12, w11
 ; CHECK-NEXT:    mov v0.h[3], w8
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
@@ -98,45 +82,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; CHECK-LABEL: combine_urem_udiv:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w9, v0.h[0]
-; CHECK-NEXT:    mov w8, #8969
-; CHECK-NEXT:    movk w8, #22765, lsl #16
+; CHECK-NEXT:    umov w8, v0.h[0]
+; CHECK-NEXT:    mov w9, #55879
+; CHECK-NEXT:    movk w9, #689, lsl #16
 ; CHECK-NEXT:    umov w10, v0.h[1]
-; CHECK-NEXT:    umov w11, v0.h[2]
-; CHECK-NEXT:    mov w15, #95
-; CHECK-NEXT:    umov w13, v0.h[3]
-; CHECK-NEXT:    umull x12, w9, w8
-; CHECK-NEXT:    umull x14, w10, w8
-; CHECK-NEXT:    lsr x12, x12, #32
-; CHECK-NEXT:    umull x17, w11, w8
-; CHECK-NEXT:    sub w16, w9, w12
-; CHECK-NEXT:    lsr x14, x14, #32
-; CHECK-NEXT:    lsr x17, x17, #32
-; CHECK-NEXT:    umull x8, w13, w8
-; CHECK-NEXT:    add w12, w12, w16, lsr #1
-; CHECK-NEXT:    sub w16, w10, w14
-; CHECK-NEXT:    lsr w12, w12, #6
+; CHECK-NEXT:    mov w12, #95
+; CHECK-NEXT:    umov w14, v0.h[2]
+; CHECK-NEXT:    umov w15, v0.h[3]
+; CHECK-NEXT:    umull x11, w8, w9
+; CHECK-NEXT:    umull x13, w10, w9
+; CHECK-NEXT:    lsr x11, x11, #32
+; CHECK-NEXT:    lsr x13, x13, #32
+; CHECK-NEXT:    msub w8, w11, w12, w8
+; CHECK-NEXT:    msub w10, w13, w12, w10
+; CHECK-NEXT:    fmov s1, w11
+; CHECK-NEXT:    fmov s0, w8
+; CHECK-NEXT:    umull x8, w14, w9
+; CHECK-NEXT:    umull x9, w15, w9
 ; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    add w14, w14, w16, lsr #1
-; CHECK-NEXT:    sub w16, w11, w17
-; CHECK-NEXT:    msub w9, w12, w15, w9
-; CHECK-NEXT:    lsr w14, w14, #6
-; CHECK-NEXT:    add w16, w17, w16, lsr #1
-; CHECK-NEXT:    fmov s1, w12
-; CHECK-NEXT:    msub w10, w14, w15, w10
-; CHECK-NEXT:    sub w17, w13, w8
-; CHECK-NEXT:    fmov s0, w9
-; CHECK-NEXT:    lsr w9, w16, #6
-; CHECK-NEXT:    mov v1.h[1], w14
-; CHECK-NEXT:    add w8, w8, w17, lsr #1
-; CHECK-NEXT:    msub w11, w9, w15, w11
-; CHECK-NEXT:    lsr w8, w8, #6
 ; CHECK-NEXT:    mov v0.h[1], w10
-; CHECK-NEXT:    msub w10, w8, w15, w13
-; CHECK-NEXT:    mov v1.h[2], w9
-; CHECK-NEXT:    mov v0.h[2], w11
-; CHECK-NEXT:    mov v1.h[3], w8
-; CHECK-NEXT:    mov v0.h[3], w10
+; CHECK-NEXT:    lsr x9, x9, #32
+; CHECK-NEXT:    msub w10, w8, w12, w14
+; CHECK-NEXT:    mov v1.h[1], w13
+; CHECK-NEXT:    msub w11, w9, w12, w15
+; CHECK-NEXT:    mov v0.h[2], w10
+; CHECK-NEXT:    mov v1.h[2], w8
+; CHECK-NEXT:    mov v0.h[3], w11
+; CHECK-NEXT:    mov v1.h[3], w9
 ; CHECK-NEXT:    add v0.4h, v0.4h, v1.4h
 ; CHECK-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 95, i16 95, i16 95, i16 95>
@@ -151,25 +123,22 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; CHECK-LABEL: dont_fold_urem_power_of_two:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w10, v0.h[0]
-; CHECK-NEXT:    umov w9, v0.h[3]
-; CHECK-NEXT:    mov w8, #8969
+; CHECK-NEXT:    umov w9, v0.h[0]
 ; CHECK-NEXT:    umov w11, v0.h[1]
-; CHECK-NEXT:    movk w8, #22765, lsl #16
-; CHECK-NEXT:    and w10, w10, #0x3f
-; CHECK-NEXT:    umull x8, w9, w8
-; CHECK-NEXT:    and w11, w11, #0x1f
+; CHECK-NEXT:    umov w10, v0.h[3]
+; CHECK-NEXT:    mov w8, #55879
+; CHECK-NEXT:    movk w8, #689, lsl #16
+; CHECK-NEXT:    and w9, w9, #0x3f
+; CHECK-NEXT:    umull x8, w10, w8
+; CHECK-NEXT:    fmov s1, w9
+; CHECK-NEXT:    and w9, w11, #0x1f
+; CHECK-NEXT:    umov w11, v0.h[2]
 ; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    fmov s1, w10
-; CHECK-NEXT:    umov w10, v0.h[2]
-; CHECK-NEXT:    sub w12, w9, w8
-; CHECK-NEXT:    mov v1.h[1], w11
-; CHECK-NEXT:    add w8, w8, w12, lsr #1
-; CHECK-NEXT:    and w10, w10, #0x7
-; CHECK-NEXT:    lsr w8, w8, #6
-; CHECK-NEXT:    mov w11, #95
-; CHECK-NEXT:    msub w8, w8, w11, w9
-; CHECK-NEXT:    mov v1.h[2], w10
+; CHECK-NEXT:    mov v1.h[1], w9
+; CHECK-NEXT:    mov w9, #95
+; CHECK-NEXT:    and w11, w11, #0x7
+; CHECK-NEXT:    msub w8, w8, w9, w10
+; CHECK-NEXT:    mov v1.h[2], w11
 ; CHECK-NEXT:    mov v1.h[3], w8
 ; CHECK-NEXT:    fmov d0, d1
 ; CHECK-NEXT:    ret
@@ -182,29 +151,28 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; CHECK-LABEL: dont_fold_srem_one:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT:    umov w9, v0.h[1]
-; CHECK-NEXT:    mov w8, #30865
-; CHECK-NEXT:    movk w8, #51306, lsl #16
-; CHECK-NEXT:    umov w11, v0.h[2]
+; CHECK-NEXT:    umov w8, v0.h[1]
+; CHECK-NEXT:    mov w9, #13629
+; CHECK-NEXT:    movk w9, #100, lsl #16
+; CHECK-NEXT:    umov w10, v0.h[2]
+; CHECK-NEXT:    mov w11, #25645
 ; CHECK-NEXT:    mov w12, #654
+; CHECK-NEXT:    movk w11, #2849, lsl #16
 ; CHECK-NEXT:    movi d1, #0000000000000000
-; CHECK-NEXT:    mov w13, #47143
-; CHECK-NEXT:    ubfx w10, w9, #1, #15
-; CHECK-NEXT:    movk w13, #24749, lsl #16
-; CHECK-NEXT:    umull x8, w10, w8
-; CHECK-NEXT:    mov w10, #17097
-; CHECK-NEXT:    movk w10, #45590, lsl #16
-; CHECK-NEXT:    lsr x8, x8, #40
-; CHECK-NEXT:    umull x10, w11, w10
-; CHECK-NEXT:    msub w8, w8, w12, w9
+; CHECK-NEXT:    umull x9, w8, w9
+; CHECK-NEXT:    mov w13, #5560
+; CHECK-NEXT:    umull x11, w10, w11
+; CHECK-NEXT:    movk w13, #12, lsl #16
+; CHECK-NEXT:    lsr x9, x9, #32
+; CHECK-NEXT:    lsr x11, x11, #32
+; CHECK-NEXT:    msub w8, w9, w12, w8
 ; CHECK-NEXT:    umov w9, v0.h[3]
-; CHECK-NEXT:    lsr x10, x10, #36
 ; CHECK-NEXT:    mov w12, #23
-; CHECK-NEXT:    msub w10, w10, w12, w11
+; CHECK-NEXT:    msub w10, w11, w12, w10
 ; CHECK-NEXT:    mov w11, #5423
 ; CHECK-NEXT:    mov v1.h[1], w8
 ; CHECK-NEXT:    umull x8, w9, w13
-; CHECK-NEXT:    lsr x8, x8, #43
+; CHECK-NEXT:    lsr x8, x8, #32
 ; CHECK-NEXT:    mov v1.h[2], w10
 ; CHECK-NEXT:    msub w8, w8, w11, w9
 ; CHECK-NEXT:    mov v1.h[3], w8

diff  --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll
index 5d3be8db524b4..de1d464548a98 100644
--- a/llvm/test/CodeGen/AMDGPU/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/fshr.ll
@@ -1335,10 +1335,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
-; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; SI-NEXT:    v_mul_hi_u32 v3, v3, s4
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
 ; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
 ; SI-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; SI-NEXT:    v_add_i32_e32 v2, vcc, 8, v2
@@ -1349,10 +1348,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
-; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; VI-NEXT:    v_mul_hi_u32 v3, v3, s4
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
 ; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
 ; VI-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, 8, v2
@@ -1363,10 +1361,9 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
-; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; GFX9-NEXT:    v_mul_hi_u32 v3, v3, s4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
 ; GFX9-NEXT:    v_sub_u32_e32 v2, v2, v3
 ; GFX9-NEXT:    v_add_u32_e32 v2, 8, v2
@@ -1384,8 +1381,7 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v3
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
+; GFX10-NEXT:    v_mul_hi_u32 v3, 0xaaaaaab, v3
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v2, 8, v2
@@ -1399,13 +1395,12 @@ define i24 @v_fshr_i24(i24 %src0, i24 %src1, i24 %src2) {
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_hi_u32 v3, 0xaaaaaaab, v3
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_hi_u32 v3, 0xaaaaaab, v3
 ; GFX11-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v2, v2, v3
 ; GFX11-NEXT:    v_add_nc_u32_e32 v2, 8, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_alignbit_b32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %ret = call i24 @llvm.fshr.i24(i24 %src0, i24 %src1, i24 %src2)
@@ -1417,19 +1412,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; SI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; SI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; SI-NEXT:    v_mul_hi_u32 v6, v6, s4
 ; SI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; SI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; SI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; SI-NEXT:    v_sub_i32_e32 v4, vcc, v4, v6
 ; SI-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; SI-NEXT:    v_add_i32_e32 v4, vcc, 8, v4
 ; SI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; SI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
-; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
+; SI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
 ; SI-NEXT:    v_sub_i32_e32 v3, vcc, v5, v3
 ; SI-NEXT:    v_add_i32_e32 v3, vcc, 8, v3
 ; SI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
@@ -1439,19 +1432,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; VI-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; VI-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; VI-NEXT:    v_mul_hi_u32 v6, v6, s4
 ; VI-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; VI-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; VI-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; VI-NEXT:    v_sub_u32_e32 v4, vcc, v4, v6
 ; VI-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, 8, v4
 ; VI-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; VI-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
-; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
+; VI-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
 ; VI-NEXT:    v_sub_u32_e32 v3, vcc, v5, v3
 ; VI-NEXT:    v_add_u32_e32 v3, vcc, 8, v3
 ; VI-NEXT:    v_alignbit_b32 v1, v1, v2, v3
@@ -1461,19 +1452,17 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v4
-; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaaab
+; GFX9-NEXT:    s_mov_b32 s4, 0xaaaaaab
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v6, s4
 ; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v7, s4
 ; GFX9-NEXT:    v_add_u32_e32 v4, 8, v4
 ; GFX9-NEXT:    v_alignbit_b32 v0, v0, v2, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 4, v6
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v3
+; GFX9-NEXT:    v_mul_u32_u24_e32 v3, 24, v6
 ; GFX9-NEXT:    v_sub_u32_e32 v3, v5, v3
 ; GFX9-NEXT:    v_add_u32_e32 v3, 8, v3
 ; GFX9-NEXT:    v_alignbit_b32 v1, v1, v2, v3
@@ -1492,10 +1481,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX10-NEXT:    v_and_b32_e32 v7, 0xffffff, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
-; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
-; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
+; GFX10-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX10-NEXT:    v_mul_u32_u24_e32 v7, 24, v7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v4, v6
@@ -1515,11 +1502,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_mul_hi_u32 v6, 0xaaaaaaab, v6
-; GFX11-NEXT:    v_mul_hi_u32 v7, 0xaaaaaaab, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 4, v6
-; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 4, v7
+; GFX11-NEXT:    v_mul_hi_u32 v6, 0xaaaaaab, v6
+; GFX11-NEXT:    v_mul_hi_u32 v7, 0xaaaaaab, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_mul_u32_u24_e32 v6, 24, v6
 ; GFX11-NEXT:    v_mul_u32_u24_e32 v7, 24, v7

diff  --git a/llvm/test/CodeGen/ARM/select-imm.ll b/llvm/test/CodeGen/ARM/select-imm.ll
index 1b88cbeeeb6ad..fcfbc318ed2a7 100644
--- a/llvm/test/CodeGen/ARM/select-imm.ll
+++ b/llvm/test/CodeGen/ARM/select-imm.ll
@@ -701,16 +701,15 @@ define i1 @t11() {
 ; ARMT2-NEXT:    sub sp, sp, #4
 ; ARMT2-NEXT:    ldr r1, [sp]
 ; ARMT2-NEXT:    mov r0, #33
-; ARMT2-NEXT:    movw r2, #52429
-; ARMT2-NEXT:    movt r2, #52428
+; ARMT2-NEXT:    movw r2, #39322
+; ARMT2-NEXT:    movt r2, #6553
 ; ARMT2-NEXT:    bfi r1, r0, #0, #12
 ; ARMT2-NEXT:    mov r0, #10
 ; ARMT2-NEXT:    bfi r1, r0, #12, #13
 ; ARMT2-NEXT:    mov r0, r1
 ; ARMT2-NEXT:    bfc r0, #12, #20
 ; ARMT2-NEXT:    umull r2, r3, r0, r2
-; ARMT2-NEXT:    lsr r2, r3, #3
-; ARMT2-NEXT:    add r2, r2, r2, lsl #2
+; ARMT2-NEXT:    add r2, r3, r3, lsl #2
 ; ARMT2-NEXT:    sub r0, r0, r2, lsl #1
 ; ARMT2-NEXT:    movw r2, #40960
 ; ARMT2-NEXT:    movt r2, #65024
@@ -764,16 +763,15 @@ define i1 @t11() {
 ; THUMB2-NEXT:    sub sp, #4
 ; THUMB2-NEXT:    ldr r1, [sp]
 ; THUMB2-NEXT:    movs r0, #33
-; THUMB2-NEXT:    movw r2, #52429
+; THUMB2-NEXT:    movw r2, #39322
 ; THUMB2-NEXT:    bfi r1, r0, #0, #12
 ; THUMB2-NEXT:    movs r0, #10
 ; THUMB2-NEXT:    bfi r1, r0, #12, #13
 ; THUMB2-NEXT:    mov r0, r1
-; THUMB2-NEXT:    movt r2, #52428
+; THUMB2-NEXT:    movt r2, #6553
 ; THUMB2-NEXT:    bfc r0, #12, #20
 ; THUMB2-NEXT:    umull r2, r3, r0, r2
-; THUMB2-NEXT:    lsrs r2, r3, #3
-; THUMB2-NEXT:    add.w r2, r2, r2, lsl #2
+; THUMB2-NEXT:    add.w r2, r3, r3, lsl #2
 ; THUMB2-NEXT:    sub.w r0, r0, r2, lsl #1
 ; THUMB2-NEXT:    movw r2, #40960
 ; THUMB2-NEXT:    movt r2, #65024

diff  --git a/llvm/test/CodeGen/PowerPC/funnel-shift.ll b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
index 128165aef4eaf..24fe892a5a5e4 100644
--- a/llvm/test/CodeGen/PowerPC/funnel-shift.ll
+++ b/llvm/test/CodeGen/PowerPC/funnel-shift.ll
@@ -352,15 +352,14 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) {
 ;
 ; CHECK64-LABEL: fshl_i37:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    lis 6, 28339
+; CHECK64-NEXT:    lis 6, 1771
 ; CHECK64-NEXT:    clrldi 7, 5, 27
-; CHECK64-NEXT:    ori 6, 6, 58451
+; CHECK64-NEXT:    ori 6, 6, 15941
 ; CHECK64-NEXT:    sldi 4, 4, 27
-; CHECK64-NEXT:    rldic 6, 6, 33, 0
-; CHECK64-NEXT:    oris 6, 6, 3542
-; CHECK64-NEXT:    ori 6, 6, 31883
+; CHECK64-NEXT:    rldic 6, 6, 32, 5
+; CHECK64-NEXT:    oris 6, 6, 12398
+; CHECK64-NEXT:    ori 6, 6, 46053
 ; CHECK64-NEXT:    mulhdu 6, 7, 6
-; CHECK64-NEXT:    rldicl 6, 6, 59, 5
 ; CHECK64-NEXT:    mulli 6, 6, 37
 ; CHECK64-NEXT:    sub 5, 5, 6
 ; CHECK64-NEXT:    clrlwi 5, 5, 26
@@ -649,15 +648,14 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) {
 ;
 ; CHECK64-LABEL: fshr_i37:
 ; CHECK64:       # %bb.0:
-; CHECK64-NEXT:    lis 6, 28339
+; CHECK64-NEXT:    lis 6, 1771
 ; CHECK64-NEXT:    clrldi 7, 5, 27
-; CHECK64-NEXT:    ori 6, 6, 58451
+; CHECK64-NEXT:    ori 6, 6, 15941
 ; CHECK64-NEXT:    sldi 4, 4, 27
-; CHECK64-NEXT:    rldic 6, 6, 33, 0
-; CHECK64-NEXT:    oris 6, 6, 3542
-; CHECK64-NEXT:    ori 6, 6, 31883
+; CHECK64-NEXT:    rldic 6, 6, 32, 5
+; CHECK64-NEXT:    oris 6, 6, 12398
+; CHECK64-NEXT:    ori 6, 6, 46053
 ; CHECK64-NEXT:    mulhdu 6, 7, 6
-; CHECK64-NEXT:    rldicl 6, 6, 59, 5
 ; CHECK64-NEXT:    mulli 6, 6, 37
 ; CHECK64-NEXT:    sub 5, 5, 6
 ; CHECK64-NEXT:    addi 5, 5, 27

diff  --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index 0dd9fc6bd5b1b..b4cdf2844a731 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -11,102 +11,86 @@
 define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P9LE-LABEL: fold_urem_vec_1:
 ; P9LE:       # %bb.0:
-; P9LE-NEXT:    li r3, 4
-; P9LE-NEXT:    lis r4, 21399
-; P9LE-NEXT:    lis r5, 8456
+; P9LE-NEXT:    li r3, 0
+; P9LE-NEXT:    lis r4, 689
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    ori r4, r4, 33437
-; P9LE-NEXT:    ori r5, r5, 16913
+; P9LE-NEXT:    ori r4, r4, 55879
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    srwi r4, r4, 5
-; P9LE-NEXT:    mulli r4, r4, 98
+; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    sub r3, r3, r4
-; P9LE-NEXT:    lis r4, 16727
+; P9LE-NEXT:    lis r4, 528
 ; P9LE-NEXT:    mtvsrd v3, r3
-; P9LE-NEXT:    li r3, 6
-; P9LE-NEXT:    ori r4, r4, 2287
+; P9LE-NEXT:    li r3, 2
+; P9LE-NEXT:    ori r4, r4, 33826
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    srwi r4, r4, 8
-; P9LE-NEXT:    mulli r4, r4, 1003
+; P9LE-NEXT:    mulli r4, r4, 124
 ; P9LE-NEXT:    sub r3, r3, r4
+; P9LE-NEXT:    lis r4, 668
 ; P9LE-NEXT:    mtvsrd v4, r3
-; P9LE-NEXT:    li r3, 2
+; P9LE-NEXT:    li r3, 4
+; P9LE-NEXT:    ori r4, r4, 48149
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    vmrghh v3, v4, v3
-; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    rlwinm r3, r3, 30, 18, 31
-; P9LE-NEXT:    mulhwu r3, r3, r5
-; P9LE-NEXT:    srwi r3, r3, 2
-; P9LE-NEXT:    mulli r3, r3, 124
-; P9LE-NEXT:    sub r3, r4, r3
-; P9LE-NEXT:    lis r4, 22765
+; P9LE-NEXT:    clrlwi r3, r3, 16
+; P9LE-NEXT:    mulhwu r4, r3, r4
+; P9LE-NEXT:    mulli r4, r4, 98
+; P9LE-NEXT:    sub r3, r3, r4
+; P9LE-NEXT:    lis r4, 65
 ; P9LE-NEXT:    mtvsrd v4, r3
-; P9LE-NEXT:    li r3, 0
-; P9LE-NEXT:    ori r4, r4, 8969
+; P9LE-NEXT:    li r3, 6
+; P9LE-NEXT:    ori r4, r4, 22281
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    sub r5, r3, r4
-; P9LE-NEXT:    srwi r5, r5, 1
-; P9LE-NEXT:    add r4, r5, r4
-; P9LE-NEXT:    srwi r4, r4, 6
-; P9LE-NEXT:    mulli r4, r4, 95
+; P9LE-NEXT:    mulli r4, r4, 1003
 ; P9LE-NEXT:    sub r3, r3, r4
 ; P9LE-NEXT:    mtvsrd v2, r3
-; P9LE-NEXT:    vmrghh v2, v4, v2
-; P9LE-NEXT:    xxmrglw v2, v3, v2
+; P9LE-NEXT:    vmrghh v2, v2, v4
+; P9LE-NEXT:    xxmrglw v2, v2, v3
 ; P9LE-NEXT:    blr
 ;
 ; P9BE-LABEL: fold_urem_vec_1:
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
-; P9BE-NEXT:    lis r4, 16727
-; P9BE-NEXT:    lis r5, 8456
+; P9BE-NEXT:    lis r4, 65
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    ori r4, r4, 2287
-; P9BE-NEXT:    ori r5, r5, 16913
+; P9BE-NEXT:    ori r4, r4, 22281
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    srwi r4, r4, 8
 ; P9BE-NEXT:    mulli r4, r4, 1003
 ; P9BE-NEXT:    sub r3, r3, r4
-; P9BE-NEXT:    lis r4, 21399
+; P9BE-NEXT:    lis r4, 668
 ; P9BE-NEXT:    mtfprwz f0, r3
 ; P9BE-NEXT:    li r3, 4
-; P9BE-NEXT:    ori r4, r4, 33437
+; P9BE-NEXT:    ori r4, r4, 48149
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    srwi r4, r4, 5
 ; P9BE-NEXT:    mulli r4, r4, 98
 ; P9BE-NEXT:    sub r3, r3, r4
+; P9BE-NEXT:    lis r4, 528
 ; P9BE-NEXT:    mtfprwz f1, r3
 ; P9BE-NEXT:    addis r3, r2, .LCPI0_0@toc@ha
+; P9BE-NEXT:    ori r4, r4, 33826
 ; P9BE-NEXT:    addi r3, r3, .LCPI0_0@toc@l
 ; P9BE-NEXT:    lxv vs2, 0(r3)
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    clrlwi r4, r3, 16
-; P9BE-NEXT:    rlwinm r3, r3, 30, 18, 31
+; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    xxperm vs0, vs1, vs2
-; P9BE-NEXT:    mulhwu r3, r3, r5
-; P9BE-NEXT:    srwi r3, r3, 2
-; P9BE-NEXT:    mulli r3, r3, 124
-; P9BE-NEXT:    sub r3, r4, r3
-; P9BE-NEXT:    lis r4, 22765
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    mulli r4, r4, 124
+; P9BE-NEXT:    sub r3, r3, r4
+; P9BE-NEXT:    lis r4, 689
 ; P9BE-NEXT:    mtfprwz f1, r3
 ; P9BE-NEXT:    li r3, 0
-; P9BE-NEXT:    ori r4, r4, 8969
+; P9BE-NEXT:    ori r4, r4, 55879
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    sub r5, r3, r4
-; P9BE-NEXT:    srwi r5, r5, 1
-; P9BE-NEXT:    add r4, r5, r4
-; P9BE-NEXT:    srwi r4, r4, 6
 ; P9BE-NEXT:    mulli r4, r4, 95
 ; P9BE-NEXT:    sub r3, r3, r4
 ; P9BE-NEXT:    mtfprwz f3, r3
@@ -117,100 +101,84 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P8LE-LABEL: fold_urem_vec_1:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r3, 22765
-; P8LE-NEXT:    lis r7, 21399
-; P8LE-NEXT:    lis r9, 16727
-; P8LE-NEXT:    lis r10, 8456
-; P8LE-NEXT:    ori r3, r3, 8969
-; P8LE-NEXT:    ori r7, r7, 33437
-; P8LE-NEXT:    ori r9, r9, 2287
-; P8LE-NEXT:    ori r10, r10, 16913
+; P8LE-NEXT:    lis r3, 689
+; P8LE-NEXT:    lis r8, 528
+; P8LE-NEXT:    lis r9, 668
+; P8LE-NEXT:    lis r10, 65
+; P8LE-NEXT:    ori r3, r3, 55879
+; P8LE-NEXT:    ori r8, r8, 33826
+; P8LE-NEXT:    ori r9, r9, 48149
+; P8LE-NEXT:    ori r10, r10, 22281
 ; P8LE-NEXT:    mffprd r4, f0
-; P8LE-NEXT:    clrldi r6, r4, 48
-; P8LE-NEXT:    rldicl r5, r4, 32, 48
-; P8LE-NEXT:    clrlwi r6, r6, 16
-; P8LE-NEXT:    rldicl r8, r4, 16, 48
+; P8LE-NEXT:    clrldi r5, r4, 48
+; P8LE-NEXT:    rldicl r6, r4, 48, 48
+; P8LE-NEXT:    rldicl r7, r4, 32, 48
+; P8LE-NEXT:    rldicl r4, r4, 16, 48
 ; P8LE-NEXT:    clrlwi r5, r5, 16
-; P8LE-NEXT:    mulhwu r3, r6, r3
-; P8LE-NEXT:    rldicl r4, r4, 48, 48
-; P8LE-NEXT:    clrlwi r8, r8, 16
-; P8LE-NEXT:    rlwinm r11, r4, 30, 18, 31
-; P8LE-NEXT:    mulhwu r7, r5, r7
+; P8LE-NEXT:    clrlwi r6, r6, 16
+; P8LE-NEXT:    mulhwu r3, r5, r3
+; P8LE-NEXT:    clrlwi r7, r7, 16
 ; P8LE-NEXT:    clrlwi r4, r4, 16
-; P8LE-NEXT:    mulhwu r9, r8, r9
-; P8LE-NEXT:    mulhwu r10, r11, r10
-; P8LE-NEXT:    sub r11, r6, r3
-; P8LE-NEXT:    srwi r11, r11, 1
-; P8LE-NEXT:    srwi r7, r7, 5
-; P8LE-NEXT:    add r3, r11, r3
-; P8LE-NEXT:    srwi r9, r9, 8
-; P8LE-NEXT:    srwi r10, r10, 2
-; P8LE-NEXT:    srwi r3, r3, 6
-; P8LE-NEXT:    mulli r7, r7, 98
-; P8LE-NEXT:    mulli r9, r9, 1003
+; P8LE-NEXT:    mulhwu r8, r6, r8
+; P8LE-NEXT:    mulhwu r9, r7, r9
+; P8LE-NEXT:    mulhwu r10, r4, r10
 ; P8LE-NEXT:    mulli r3, r3, 95
-; P8LE-NEXT:    mulli r10, r10, 124
-; P8LE-NEXT:    sub r5, r5, r7
-; P8LE-NEXT:    sub r7, r8, r9
-; P8LE-NEXT:    sub r3, r6, r3
-; P8LE-NEXT:    mtvsrd v2, r5
+; P8LE-NEXT:    mulli r8, r8, 124
+; P8LE-NEXT:    mulli r9, r9, 98
+; P8LE-NEXT:    mulli r10, r10, 1003
+; P8LE-NEXT:    sub r3, r5, r3
+; P8LE-NEXT:    sub r5, r6, r8
+; P8LE-NEXT:    mtvsrd v2, r3
+; P8LE-NEXT:    sub r3, r7, r9
 ; P8LE-NEXT:    sub r4, r4, r10
-; P8LE-NEXT:    mtvsrd v3, r7
+; P8LE-NEXT:    mtvsrd v3, r5
 ; P8LE-NEXT:    mtvsrd v4, r3
 ; P8LE-NEXT:    mtvsrd v5, r4
 ; P8LE-NEXT:    vmrghh v2, v3, v2
 ; P8LE-NEXT:    vmrghh v3, v5, v4
-; P8LE-NEXT:    xxmrglw v2, v2, v3
+; P8LE-NEXT:    xxmrglw v2, v3, v2
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: fold_urem_vec_1:
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 22765
-; P8BE-NEXT:    lis r7, 16727
-; P8BE-NEXT:    lis r9, 21399
-; P8BE-NEXT:    lis r10, 8456
-; P8BE-NEXT:    ori r3, r3, 8969
-; P8BE-NEXT:    ori r7, r7, 2287
-; P8BE-NEXT:    ori r9, r9, 33437
-; P8BE-NEXT:    ori r10, r10, 16913
-; P8BE-NEXT:    rldicl r6, r4, 16, 48
+; P8BE-NEXT:    lis r3, 65
+; P8BE-NEXT:    lis r8, 668
+; P8BE-NEXT:    lis r9, 528
+; P8BE-NEXT:    lis r10, 689
+; P8BE-NEXT:    ori r3, r3, 22281
+; P8BE-NEXT:    ori r8, r8, 48149
+; P8BE-NEXT:    ori r9, r9, 33826
+; P8BE-NEXT:    ori r10, r10, 55879
 ; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    clrlwi r6, r6, 16
+; P8BE-NEXT:    rldicl r6, r4, 48, 48
 ; P8BE-NEXT:    clrlwi r5, r5, 16
-; P8BE-NEXT:    mulhwu r3, r6, r3
-; P8BE-NEXT:    rldicl r8, r4, 48, 48
-; P8BE-NEXT:    mulhwu r7, r5, r7
-; P8BE-NEXT:    rldicl r4, r4, 32, 48
-; P8BE-NEXT:    clrlwi r8, r8, 16
-; P8BE-NEXT:    rlwinm r11, r4, 30, 18, 31
-; P8BE-NEXT:    mulhwu r9, r8, r9
+; P8BE-NEXT:    rldicl r7, r4, 32, 48
+; P8BE-NEXT:    clrlwi r6, r6, 16
+; P8BE-NEXT:    rldicl r4, r4, 16, 48
+; P8BE-NEXT:    mulhwu r3, r5, r3
+; P8BE-NEXT:    clrlwi r7, r7, 16
 ; P8BE-NEXT:    clrlwi r4, r4, 16
-; P8BE-NEXT:    mulhwu r10, r11, r10
-; P8BE-NEXT:    sub r11, r6, r3
-; P8BE-NEXT:    srwi r7, r7, 8
-; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    add r3, r11, r3
-; P8BE-NEXT:    mulli r7, r7, 1003
-; P8BE-NEXT:    srwi r9, r9, 5
-; P8BE-NEXT:    srwi r3, r3, 6
-; P8BE-NEXT:    srwi r10, r10, 2
-; P8BE-NEXT:    mulli r9, r9, 98
-; P8BE-NEXT:    mulli r3, r3, 95
-; P8BE-NEXT:    mulli r10, r10, 124
-; P8BE-NEXT:    sub r5, r5, r7
-; P8BE-NEXT:    addis r7, r2, .LCPI0_0@toc@ha
-; P8BE-NEXT:    mtvsrwz v2, r5
-; P8BE-NEXT:    addi r5, r7, .LCPI0_0@toc@l
-; P8BE-NEXT:    sub r8, r8, r9
-; P8BE-NEXT:    lxvw4x v3, 0, r5
-; P8BE-NEXT:    sub r3, r6, r3
+; P8BE-NEXT:    mulhwu r8, r6, r8
+; P8BE-NEXT:    mulhwu r9, r7, r9
+; P8BE-NEXT:    mulhwu r10, r4, r10
+; P8BE-NEXT:    mulli r3, r3, 1003
+; P8BE-NEXT:    mulli r8, r8, 98
+; P8BE-NEXT:    mulli r9, r9, 124
+; P8BE-NEXT:    mulli r10, r10, 95
+; P8BE-NEXT:    sub r3, r5, r3
+; P8BE-NEXT:    addis r5, r2, .LCPI0_0@toc@ha
+; P8BE-NEXT:    mtvsrwz v2, r3
+; P8BE-NEXT:    addi r3, r5, .LCPI0_0@toc@l
+; P8BE-NEXT:    sub r6, r6, r8
+; P8BE-NEXT:    lxvw4x v3, 0, r3
+; P8BE-NEXT:    sub r3, r7, r9
 ; P8BE-NEXT:    sub r4, r4, r10
-; P8BE-NEXT:    mtvsrwz v4, r8
+; P8BE-NEXT:    mtvsrwz v4, r6
 ; P8BE-NEXT:    mtvsrwz v5, r3
 ; P8BE-NEXT:    mtvsrwz v0, r4
 ; P8BE-NEXT:    vperm v2, v4, v2, v3
-; P8BE-NEXT:    vperm v3, v5, v0, v3
+; P8BE-NEXT:    vperm v3, v0, v5, v3
 ; P8BE-NEXT:    xxmrghw v2, v3, v2
 ; P8BE-NEXT:    blr
   %1 = urem <4 x i16> %x, <i16 95, i16 124, i16 98, i16 1003>
@@ -221,15 +189,11 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-LABEL: fold_urem_vec_2:
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
-; P9LE-NEXT:    lis r4, 22765
+; P9LE-NEXT:    lis r4, 689
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    ori r4, r4, 8969
+; P9LE-NEXT:    ori r4, r4, 55879
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r5, r3, r4
-; P9LE-NEXT:    sub r6, r3, r5
-; P9LE-NEXT:    srwi r6, r6, 1
-; P9LE-NEXT:    add r5, r6, r5
-; P9LE-NEXT:    srwi r5, r5, 6
 ; P9LE-NEXT:    mulli r5, r5, 95
 ; P9LE-NEXT:    sub r3, r3, r5
 ; P9LE-NEXT:    mtvsrd v3, r3
@@ -237,10 +201,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r5, r3, r4
-; P9LE-NEXT:    sub r6, r3, r5
-; P9LE-NEXT:    srwi r6, r6, 1
-; P9LE-NEXT:    add r5, r6, r5
-; P9LE-NEXT:    srwi r5, r5, 6
 ; P9LE-NEXT:    mulli r5, r5, 95
 ; P9LE-NEXT:    sub r3, r3, r5
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -249,10 +209,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    vmrghh v3, v4, v3
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r5, r3, r4
-; P9LE-NEXT:    sub r6, r3, r5
-; P9LE-NEXT:    srwi r6, r6, 1
-; P9LE-NEXT:    add r5, r6, r5
-; P9LE-NEXT:    srwi r5, r5, 6
 ; P9LE-NEXT:    mulli r5, r5, 95
 ; P9LE-NEXT:    sub r3, r3, r5
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -260,10 +216,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    sub r5, r3, r4
-; P9LE-NEXT:    srwi r5, r5, 1
-; P9LE-NEXT:    add r4, r5, r4
-; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    sub r3, r3, r4
 ; P9LE-NEXT:    mtvsrd v2, r3
@@ -274,15 +226,11 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9BE-LABEL: fold_urem_vec_2:
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
-; P9BE-NEXT:    lis r4, 22765
+; P9BE-NEXT:    lis r4, 689
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    ori r4, r4, 8969
+; P9BE-NEXT:    ori r4, r4, 55879
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r5, r3, r4
-; P9BE-NEXT:    sub r6, r3, r5
-; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r5, r6, r5
-; P9BE-NEXT:    srwi r5, r5, 6
 ; P9BE-NEXT:    mulli r5, r5, 95
 ; P9BE-NEXT:    sub r3, r3, r5
 ; P9BE-NEXT:    mtfprwz f0, r3
@@ -290,10 +238,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r5, r3, r4
-; P9BE-NEXT:    sub r6, r3, r5
-; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r5, r6, r5
-; P9BE-NEXT:    srwi r5, r5, 6
 ; P9BE-NEXT:    mulli r5, r5, 95
 ; P9BE-NEXT:    sub r3, r3, r5
 ; P9BE-NEXT:    mtfprwz f1, r3
@@ -305,10 +249,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    xxperm vs0, vs1, vs2
 ; P9BE-NEXT:    mulhwu r5, r3, r4
-; P9BE-NEXT:    sub r6, r3, r5
-; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r5, r6, r5
-; P9BE-NEXT:    srwi r5, r5, 6
 ; P9BE-NEXT:    mulli r5, r5, 95
 ; P9BE-NEXT:    sub r3, r3, r5
 ; P9BE-NEXT:    mtfprwz f1, r3
@@ -316,10 +256,6 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    sub r5, r3, r4
-; P9BE-NEXT:    srwi r5, r5, 1
-; P9BE-NEXT:    add r4, r5, r4
-; P9BE-NEXT:    srwi r4, r4, 6
 ; P9BE-NEXT:    mulli r4, r4, 95
 ; P9BE-NEXT:    sub r3, r3, r4
 ; P9BE-NEXT:    mtfprwz f3, r3
@@ -330,38 +266,22 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P8LE-LABEL: fold_urem_vec_2:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r3, 22765
-; P8LE-NEXT:    ori r3, r3, 8969
+; P8LE-NEXT:    lis r3, 689
+; P8LE-NEXT:    ori r3, r3, 55879
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    clrldi r5, r4, 48
 ; P8LE-NEXT:    rldicl r6, r4, 48, 48
-; P8LE-NEXT:    clrlwi r5, r5, 16
 ; P8LE-NEXT:    rldicl r7, r4, 32, 48
+; P8LE-NEXT:    rldicl r4, r4, 16, 48
+; P8LE-NEXT:    clrlwi r5, r5, 16
 ; P8LE-NEXT:    clrlwi r6, r6, 16
 ; P8LE-NEXT:    mulhwu r8, r5, r3
-; P8LE-NEXT:    rldicl r4, r4, 16, 48
 ; P8LE-NEXT:    clrlwi r7, r7, 16
-; P8LE-NEXT:    mulhwu r9, r6, r3
 ; P8LE-NEXT:    clrlwi r4, r4, 16
+; P8LE-NEXT:    mulhwu r9, r6, r3
 ; P8LE-NEXT:    mulhwu r10, r7, r3
 ; P8LE-NEXT:    mulhwu r3, r4, r3
-; P8LE-NEXT:    sub r11, r5, r8
-; P8LE-NEXT:    sub r12, r6, r9
-; P8LE-NEXT:    srwi r11, r11, 1
-; P8LE-NEXT:    add r8, r11, r8
-; P8LE-NEXT:    sub r11, r7, r10
-; P8LE-NEXT:    srwi r12, r12, 1
-; P8LE-NEXT:    add r9, r12, r9
-; P8LE-NEXT:    sub r12, r4, r3
-; P8LE-NEXT:    srwi r11, r11, 1
-; P8LE-NEXT:    srwi r8, r8, 6
-; P8LE-NEXT:    add r10, r11, r10
-; P8LE-NEXT:    srwi r11, r12, 1
-; P8LE-NEXT:    srwi r9, r9, 6
-; P8LE-NEXT:    add r3, r11, r3
 ; P8LE-NEXT:    mulli r8, r8, 95
-; P8LE-NEXT:    srwi r10, r10, 6
-; P8LE-NEXT:    srwi r3, r3, 6
 ; P8LE-NEXT:    mulli r9, r9, 95
 ; P8LE-NEXT:    mulli r10, r10, 95
 ; P8LE-NEXT:    mulli r3, r3, 95
@@ -381,37 +301,21 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P8BE-LABEL: fold_urem_vec_2:
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 22765
-; P8BE-NEXT:    ori r3, r3, 8969
+; P8BE-NEXT:    lis r3, 689
+; P8BE-NEXT:    ori r3, r3, 55879
 ; P8BE-NEXT:    clrldi r5, r4, 48
 ; P8BE-NEXT:    rldicl r6, r4, 48, 48
 ; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    rldicl r7, r4, 32, 48
 ; P8BE-NEXT:    clrlwi r6, r6, 16
-; P8BE-NEXT:    mulhwu r8, r5, r3
 ; P8BE-NEXT:    rldicl r4, r4, 16, 48
+; P8BE-NEXT:    mulhwu r8, r5, r3
 ; P8BE-NEXT:    clrlwi r7, r7, 16
-; P8BE-NEXT:    mulhwu r9, r6, r3
 ; P8BE-NEXT:    clrlwi r4, r4, 16
+; P8BE-NEXT:    mulhwu r9, r6, r3
 ; P8BE-NEXT:    mulhwu r10, r7, r3
 ; P8BE-NEXT:    mulhwu r3, r4, r3
-; P8BE-NEXT:    sub r11, r5, r8
-; P8BE-NEXT:    sub r12, r6, r9
-; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    add r8, r11, r8
-; P8BE-NEXT:    sub r11, r7, r10
-; P8BE-NEXT:    srwi r12, r12, 1
-; P8BE-NEXT:    add r9, r12, r9
-; P8BE-NEXT:    sub r12, r4, r3
-; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    srwi r8, r8, 6
-; P8BE-NEXT:    add r10, r11, r10
-; P8BE-NEXT:    srwi r11, r12, 1
-; P8BE-NEXT:    srwi r9, r9, 6
 ; P8BE-NEXT:    mulli r8, r8, 95
-; P8BE-NEXT:    add r3, r11, r3
-; P8BE-NEXT:    srwi r10, r10, 6
-; P8BE-NEXT:    srwi r3, r3, 6
 ; P8BE-NEXT:    mulli r9, r9, 95
 ; P8BE-NEXT:    mulli r10, r10, 95
 ; P8BE-NEXT:    mulli r3, r3, 95
@@ -440,26 +344,18 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE-LABEL: combine_urem_udiv:
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
-; P9LE-NEXT:    lis r4, 22765
+; P9LE-NEXT:    lis r4, 689
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    ori r4, r4, 8969
+; P9LE-NEXT:    ori r4, r4, 55879
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r5, r3, r4
-; P9LE-NEXT:    sub r6, r3, r5
-; P9LE-NEXT:    srwi r6, r6, 1
-; P9LE-NEXT:    add r5, r6, r5
-; P9LE-NEXT:    srwi r5, r5, 6
 ; P9LE-NEXT:    mulli r6, r5, 95
 ; P9LE-NEXT:    sub r3, r3, r6
 ; P9LE-NEXT:    mtvsrd v3, r3
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r6, r3, 16
-; P9LE-NEXT:    mulhwu r7, r6, r4
-; P9LE-NEXT:    sub r6, r6, r7
-; P9LE-NEXT:    srwi r6, r6, 1
-; P9LE-NEXT:    add r6, r6, r7
-; P9LE-NEXT:    srwi r6, r6, 6
+; P9LE-NEXT:    mulhwu r6, r6, r4
 ; P9LE-NEXT:    mulli r7, r6, 95
 ; P9LE-NEXT:    sub r3, r3, r7
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -467,11 +363,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    vmrghh v3, v4, v3
 ; P9LE-NEXT:    clrlwi r7, r3, 16
-; P9LE-NEXT:    mulhwu r8, r7, r4
-; P9LE-NEXT:    sub r7, r7, r8
-; P9LE-NEXT:    srwi r7, r7, 1
-; P9LE-NEXT:    add r7, r7, r8
-; P9LE-NEXT:    srwi r7, r7, 6
+; P9LE-NEXT:    mulhwu r7, r7, r4
 ; P9LE-NEXT:    mulli r8, r7, 95
 ; P9LE-NEXT:    sub r3, r3, r8
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -479,10 +371,6 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r8, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r8, r4
-; P9LE-NEXT:    sub r8, r8, r4
-; P9LE-NEXT:    srwi r8, r8, 1
-; P9LE-NEXT:    add r4, r8, r4
-; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r8, r4, 95
 ; P9LE-NEXT:    mtvsrd v5, r4
 ; P9LE-NEXT:    sub r3, r3, r8
@@ -501,26 +389,18 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE-LABEL: combine_urem_udiv:
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
-; P9BE-NEXT:    lis r5, 22765
+; P9BE-NEXT:    lis r5, 689
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    ori r5, r5, 8969
+; P9BE-NEXT:    ori r5, r5, 55879
 ; P9BE-NEXT:    clrlwi r4, r3, 16
-; P9BE-NEXT:    mulhwu r6, r4, r5
-; P9BE-NEXT:    sub r4, r4, r6
-; P9BE-NEXT:    srwi r4, r4, 1
-; P9BE-NEXT:    add r4, r4, r6
-; P9BE-NEXT:    srwi r4, r4, 6
+; P9BE-NEXT:    mulhwu r4, r4, r5
 ; P9BE-NEXT:    mulli r6, r4, 95
 ; P9BE-NEXT:    sub r3, r3, r6
 ; P9BE-NEXT:    mtfprwz f0, r3
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r6, r3, 16
-; P9BE-NEXT:    mulhwu r7, r6, r5
-; P9BE-NEXT:    sub r6, r6, r7
-; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r6, r6, r7
-; P9BE-NEXT:    srwi r6, r6, 6
+; P9BE-NEXT:    mulhwu r6, r6, r5
 ; P9BE-NEXT:    mulli r7, r6, 95
 ; P9BE-NEXT:    sub r3, r3, r7
 ; P9BE-NEXT:    mtfprwz f1, r3
@@ -531,11 +411,7 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r7, r3, 16
 ; P9BE-NEXT:    xxperm vs0, vs1, vs2
-; P9BE-NEXT:    mulhwu r8, r7, r5
-; P9BE-NEXT:    sub r7, r7, r8
-; P9BE-NEXT:    srwi r7, r7, 1
-; P9BE-NEXT:    add r7, r7, r8
-; P9BE-NEXT:    srwi r7, r7, 6
+; P9BE-NEXT:    mulhwu r7, r7, r5
 ; P9BE-NEXT:    mulli r8, r7, 95
 ; P9BE-NEXT:    sub r3, r3, r8
 ; P9BE-NEXT:    mtfprwz f1, r3
@@ -543,10 +419,6 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r5, r3, r5
-; P9BE-NEXT:    sub r8, r3, r5
-; P9BE-NEXT:    srwi r8, r8, 1
-; P9BE-NEXT:    add r5, r8, r5
-; P9BE-NEXT:    srwi r5, r5, 6
 ; P9BE-NEXT:    mulli r8, r5, 95
 ; P9BE-NEXT:    sub r3, r3, r8
 ; P9BE-NEXT:    mtfprwz f3, r3
@@ -565,9 +437,8 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P8LE-LABEL: combine_urem_udiv:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r3, 22765
-; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r3, r3, 8969
+; P8LE-NEXT:    lis r3, 689
+; P8LE-NEXT:    ori r3, r3, 55879
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    clrldi r5, r4, 48
 ; P8LE-NEXT:    rldicl r6, r4, 48, 48
@@ -576,41 +447,24 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P8LE-NEXT:    rldicl r7, r4, 32, 48
 ; P8LE-NEXT:    rldicl r4, r4, 16, 48
 ; P8LE-NEXT:    mulhwu r9, r5, r3
-; P8LE-NEXT:    mulhwu r11, r8, r3
+; P8LE-NEXT:    mulhwu r8, r8, r3
 ; P8LE-NEXT:    clrlwi r10, r7, 16
-; P8LE-NEXT:    clrlwi r12, r4, 16
-; P8LE-NEXT:    mulhwu r0, r10, r3
-; P8LE-NEXT:    mulhwu r3, r12, r3
-; P8LE-NEXT:    sub r30, r5, r9
-; P8LE-NEXT:    sub r8, r8, r11
-; P8LE-NEXT:    srwi r30, r30, 1
-; P8LE-NEXT:    srwi r8, r8, 1
-; P8LE-NEXT:    sub r10, r10, r0
-; P8LE-NEXT:    add r9, r30, r9
-; P8LE-NEXT:    add r8, r8, r11
-; P8LE-NEXT:    sub r11, r12, r3
-; P8LE-NEXT:    srwi r10, r10, 1
-; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; P8LE-NEXT:    srwi r9, r9, 6
-; P8LE-NEXT:    srwi r11, r11, 1
-; P8LE-NEXT:    srwi r8, r8, 6
-; P8LE-NEXT:    add r10, r10, r0
-; P8LE-NEXT:    mulli r12, r9, 95
-; P8LE-NEXT:    add r3, r11, r3
+; P8LE-NEXT:    clrlwi r11, r4, 16
+; P8LE-NEXT:    mulhwu r10, r10, r3
+; P8LE-NEXT:    mulhwu r3, r11, r3
+; P8LE-NEXT:    mulli r11, r9, 95
 ; P8LE-NEXT:    mtvsrd v2, r9
-; P8LE-NEXT:    srwi r10, r10, 6
 ; P8LE-NEXT:    mulli r9, r8, 95
-; P8LE-NEXT:    srwi r3, r3, 6
 ; P8LE-NEXT:    mtvsrd v3, r8
 ; P8LE-NEXT:    mulli r8, r10, 95
 ; P8LE-NEXT:    mtvsrd v4, r10
 ; P8LE-NEXT:    mulli r10, r3, 95
 ; P8LE-NEXT:    vmrghh v2, v3, v2
-; P8LE-NEXT:    sub r5, r5, r12
+; P8LE-NEXT:    sub r5, r5, r11
 ; P8LE-NEXT:    sub r6, r6, r9
 ; P8LE-NEXT:    mtvsrd v3, r5
-; P8LE-NEXT:    mtvsrd v5, r6
 ; P8LE-NEXT:    sub r5, r7, r8
+; P8LE-NEXT:    mtvsrd v5, r6
 ; P8LE-NEXT:    sub r4, r4, r10
 ; P8LE-NEXT:    mtvsrd v0, r5
 ; P8LE-NEXT:    mtvsrd v1, r4
@@ -625,58 +479,42 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: combine_urem_udiv:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r5, v2
-; P8BE-NEXT:    lis r4, 22765
-; P8BE-NEXT:    ori r4, r4, 8969
-; P8BE-NEXT:    clrldi r3, r5, 48
-; P8BE-NEXT:    rldicl r6, r5, 48, 48
-; P8BE-NEXT:    clrlwi r8, r3, 16
+; P8BE-NEXT:    mfvsrd r4, v2
+; P8BE-NEXT:    lis r3, 689
+; P8BE-NEXT:    addis r11, r2, .LCPI2_0@toc@ha
+; P8BE-NEXT:    ori r3, r3, 55879
+; P8BE-NEXT:    addi r11, r11, .LCPI2_0@toc@l
+; P8BE-NEXT:    clrldi r5, r4, 48
+; P8BE-NEXT:    rldicl r6, r4, 48, 48
+; P8BE-NEXT:    lxvw4x v2, 0, r11
+; P8BE-NEXT:    clrlwi r8, r5, 16
 ; P8BE-NEXT:    clrlwi r9, r6, 16
-; P8BE-NEXT:    rldicl r7, r5, 32, 48
-; P8BE-NEXT:    rldicl r5, r5, 16, 48
-; P8BE-NEXT:    mulhwu r10, r8, r4
-; P8BE-NEXT:    mulhwu r12, r9, r4
-; P8BE-NEXT:    clrlwi r11, r7, 16
-; P8BE-NEXT:    clrlwi r5, r5, 16
-; P8BE-NEXT:    mulhwu r0, r11, r4
-; P8BE-NEXT:    mulhwu r4, r5, r4
-; P8BE-NEXT:    sub r8, r8, r10
-; P8BE-NEXT:    sub r9, r9, r12
-; P8BE-NEXT:    srwi r8, r8, 1
-; P8BE-NEXT:    srwi r9, r9, 1
-; P8BE-NEXT:    sub r11, r11, r0
-; P8BE-NEXT:    add r8, r8, r10
-; P8BE-NEXT:    add r9, r9, r12
-; P8BE-NEXT:    sub r12, r5, r4
-; P8BE-NEXT:    addis r10, r2, .LCPI2_0@toc@ha
-; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    srwi r8, r8, 6
-; P8BE-NEXT:    srwi r12, r12, 1
-; P8BE-NEXT:    srwi r9, r9, 6
-; P8BE-NEXT:    addi r10, r10, .LCPI2_0@toc@l
-; P8BE-NEXT:    add r11, r11, r0
-; P8BE-NEXT:    mulli r0, r8, 95
-; P8BE-NEXT:    add r4, r12, r4
+; P8BE-NEXT:    rldicl r7, r4, 32, 48
+; P8BE-NEXT:    rldicl r4, r4, 16, 48
+; P8BE-NEXT:    mulhwu r8, r8, r3
+; P8BE-NEXT:    mulhwu r9, r9, r3
+; P8BE-NEXT:    clrlwi r10, r7, 16
+; P8BE-NEXT:    clrlwi r4, r4, 16
+; P8BE-NEXT:    mulhwu r10, r10, r3
+; P8BE-NEXT:    mulhwu r3, r4, r3
+; P8BE-NEXT:    mulli r12, r8, 95
 ; P8BE-NEXT:    mtvsrwz v3, r8
-; P8BE-NEXT:    lxvw4x v2, 0, r10
-; P8BE-NEXT:    srwi r10, r11, 6
 ; P8BE-NEXT:    mulli r8, r9, 95
-; P8BE-NEXT:    srwi r4, r4, 6
 ; P8BE-NEXT:    mtvsrwz v4, r9
 ; P8BE-NEXT:    mulli r9, r10, 95
 ; P8BE-NEXT:    mtvsrwz v5, r10
-; P8BE-NEXT:    mulli r10, r4, 95
+; P8BE-NEXT:    mulli r10, r3, 95
 ; P8BE-NEXT:    vperm v3, v4, v3, v2
-; P8BE-NEXT:    sub r3, r3, r0
+; P8BE-NEXT:    sub r5, r5, r12
 ; P8BE-NEXT:    sub r6, r6, r8
-; P8BE-NEXT:    mtvsrwz v4, r3
+; P8BE-NEXT:    mtvsrwz v4, r5
+; P8BE-NEXT:    sub r5, r7, r9
 ; P8BE-NEXT:    mtvsrwz v0, r6
-; P8BE-NEXT:    sub r3, r7, r9
-; P8BE-NEXT:    sub r5, r5, r10
-; P8BE-NEXT:    mtvsrwz v1, r3
-; P8BE-NEXT:    mtvsrwz v6, r5
+; P8BE-NEXT:    sub r4, r4, r10
+; P8BE-NEXT:    mtvsrwz v1, r5
+; P8BE-NEXT:    mtvsrwz v6, r4
 ; P8BE-NEXT:    vperm v4, v0, v4, v2
-; P8BE-NEXT:    mtvsrwz v0, r4
+; P8BE-NEXT:    mtvsrwz v0, r3
 ; P8BE-NEXT:    vperm v1, v6, v1, v2
 ; P8BE-NEXT:    vperm v2, v0, v5, v2
 ; P8BE-NEXT:    xxmrghw v4, v1, v4
@@ -694,9 +532,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9LE-LABEL: dont_fold_urem_power_of_two:
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
-; P9LE-NEXT:    lis r4, 22765
+; P9LE-NEXT:    lis r4, 689
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    ori r4, r4, 8969
+; P9LE-NEXT:    ori r4, r4, 55879
 ; P9LE-NEXT:    clrlwi r3, r3, 26
 ; P9LE-NEXT:    mtvsrd v3, r3
 ; P9LE-NEXT:    li r3, 2
@@ -708,10 +546,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9LE-NEXT:    vmrghh v3, v4, v3
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    sub r5, r3, r4
-; P9LE-NEXT:    srwi r5, r5, 1
-; P9LE-NEXT:    add r4, r5, r4
-; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    sub r3, r3, r4
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -726,9 +560,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9BE-LABEL: dont_fold_urem_power_of_two:
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 2
-; P9BE-NEXT:    lis r4, 22765
+; P9BE-NEXT:    lis r4, 689
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    ori r4, r4, 8969
+; P9BE-NEXT:    ori r4, r4, 55879
 ; P9BE-NEXT:    clrlwi r3, r3, 27
 ; P9BE-NEXT:    mtfprwz f0, r3
 ; P9BE-NEXT:    li r3, 0
@@ -743,10 +577,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    xxperm vs0, vs1, vs2
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    sub r5, r3, r4
-; P9BE-NEXT:    srwi r5, r5, 1
-; P9BE-NEXT:    add r4, r5, r4
-; P9BE-NEXT:    srwi r4, r4, 6
 ; P9BE-NEXT:    mulli r4, r4, 95
 ; P9BE-NEXT:    sub r3, r3, r4
 ; P9BE-NEXT:    mtfprwz f1, r3
@@ -761,50 +591,41 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P8LE-LABEL: dont_fold_urem_power_of_two:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r3, 22765
-; P8LE-NEXT:    ori r3, r3, 8969
+; P8LE-NEXT:    lis r3, 689
+; P8LE-NEXT:    ori r3, r3, 55879
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    rldicl r5, r4, 16, 48
-; P8LE-NEXT:    rldicl r7, r4, 48, 48
-; P8LE-NEXT:    clrlwi r5, r5, 16
-; P8LE-NEXT:    mulhwu r3, r5, r3
-; P8LE-NEXT:    sub r6, r5, r3
-; P8LE-NEXT:    srwi r6, r6, 1
-; P8LE-NEXT:    add r3, r6, r3
 ; P8LE-NEXT:    clrldi r6, r4, 48
-; P8LE-NEXT:    srwi r3, r3, 6
+; P8LE-NEXT:    clrlwi r5, r5, 16
 ; P8LE-NEXT:    clrlwi r6, r6, 26
-; P8LE-NEXT:    mulli r3, r3, 95
-; P8LE-NEXT:    rldicl r4, r4, 32, 48
+; P8LE-NEXT:    mulhwu r3, r5, r3
+; P8LE-NEXT:    rldicl r7, r4, 48, 48
 ; P8LE-NEXT:    mtvsrd v2, r6
+; P8LE-NEXT:    rldicl r4, r4, 32, 48
 ; P8LE-NEXT:    clrlwi r6, r7, 27
 ; P8LE-NEXT:    clrlwi r4, r4, 29
 ; P8LE-NEXT:    mtvsrd v3, r6
-; P8LE-NEXT:    mtvsrd v5, r4
+; P8LE-NEXT:    mtvsrd v4, r4
+; P8LE-NEXT:    mulli r3, r3, 95
 ; P8LE-NEXT:    vmrghh v2, v3, v2
 ; P8LE-NEXT:    sub r3, r5, r3
-; P8LE-NEXT:    mtvsrd v4, r3
-; P8LE-NEXT:    vmrghh v3, v4, v5
+; P8LE-NEXT:    mtvsrd v5, r3
+; P8LE-NEXT:    vmrghh v3, v5, v4
 ; P8LE-NEXT:    xxmrglw v2, v3, v2
 ; P8LE-NEXT:    blr
 ;
 ; P8BE-LABEL: dont_fold_urem_power_of_two:
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 22765
+; P8BE-NEXT:    lis r3, 689
 ; P8BE-NEXT:    addis r7, r2, .LCPI3_0@toc@ha
-; P8BE-NEXT:    ori r3, r3, 8969
+; P8BE-NEXT:    ori r3, r3, 55879
 ; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r8, r4, 16, 48
-; P8BE-NEXT:    clrlwi r5, r5, 16
-; P8BE-NEXT:    mulhwu r3, r5, r3
-; P8BE-NEXT:    sub r6, r5, r3
-; P8BE-NEXT:    srwi r6, r6, 1
-; P8BE-NEXT:    add r3, r6, r3
 ; P8BE-NEXT:    rldicl r6, r4, 32, 48
-; P8BE-NEXT:    srwi r3, r3, 6
+; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    clrlwi r6, r6, 27
-; P8BE-NEXT:    mulli r3, r3, 95
+; P8BE-NEXT:    mulhwu r3, r5, r3
+; P8BE-NEXT:    rldicl r8, r4, 16, 48
 ; P8BE-NEXT:    mtvsrwz v2, r6
 ; P8BE-NEXT:    addi r6, r7, .LCPI3_0@toc@l
 ; P8BE-NEXT:    rldicl r4, r4, 48, 48
@@ -813,8 +634,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P8BE-NEXT:    clrlwi r4, r4, 29
 ; P8BE-NEXT:    mtvsrwz v4, r7
 ; P8BE-NEXT:    mtvsrwz v0, r4
-; P8BE-NEXT:    sub r3, r5, r3
+; P8BE-NEXT:    mulli r3, r3, 95
 ; P8BE-NEXT:    vperm v2, v4, v2, v3
+; P8BE-NEXT:    sub r3, r5, r3
 ; P8BE-NEXT:    mtvsrwz v5, r3
 ; P8BE-NEXT:    vperm v3, v0, v5, v3
 ; P8BE-NEXT:    xxmrghw v2, v2, v3
@@ -828,36 +650,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P9LE-LABEL: dont_fold_urem_one:
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 4
-; P9LE-NEXT:    lis r4, -19946
-; P9LE-NEXT:    lis r5, -14230
+; P9LE-NEXT:    lis r4, 2849
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    ori r4, r4, 17097
-; P9LE-NEXT:    ori r5, r5, 30865
+; P9LE-NEXT:    ori r4, r4, 25645
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    srwi r4, r4, 4
 ; P9LE-NEXT:    mulli r4, r4, 23
 ; P9LE-NEXT:    sub r3, r3, r4
-; P9LE-NEXT:    lis r4, 24749
+; P9LE-NEXT:    lis r4, 12
 ; P9LE-NEXT:    mtvsrd v3, r3
 ; P9LE-NEXT:    li r3, 6
-; P9LE-NEXT:    ori r4, r4, 47143
+; P9LE-NEXT:    ori r4, r4, 5560
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 16
 ; P9LE-NEXT:    mulhwu r4, r3, r4
-; P9LE-NEXT:    srwi r4, r4, 11
 ; P9LE-NEXT:    mulli r4, r4, 5423
 ; P9LE-NEXT:    sub r3, r3, r4
+; P9LE-NEXT:    lis r4, 100
 ; P9LE-NEXT:    mtvsrd v4, r3
 ; P9LE-NEXT:    li r3, 2
+; P9LE-NEXT:    ori r4, r4, 13629
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    vmrghh v3, v4, v3
-; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    rlwinm r3, r3, 31, 17, 31
-; P9LE-NEXT:    mulhwu r3, r3, r5
-; P9LE-NEXT:    srwi r3, r3, 8
-; P9LE-NEXT:    mulli r3, r3, 654
-; P9LE-NEXT:    sub r3, r4, r3
+; P9LE-NEXT:    clrlwi r3, r3, 16
+; P9LE-NEXT:    mulhwu r4, r3, r4
+; P9LE-NEXT:    mulli r4, r4, 654
+; P9LE-NEXT:    sub r3, r3, r4
 ; P9LE-NEXT:    mtvsrd v2, r3
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    mtvsrd v4, r3
@@ -868,39 +686,35 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P9BE-LABEL: dont_fold_urem_one:
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
-; P9BE-NEXT:    lis r4, 24749
-; P9BE-NEXT:    lis r5, -14230
+; P9BE-NEXT:    lis r4, 12
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    ori r4, r4, 47143
-; P9BE-NEXT:    ori r5, r5, 30865
+; P9BE-NEXT:    ori r4, r4, 5560
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    srwi r4, r4, 11
 ; P9BE-NEXT:    mulli r4, r4, 5423
 ; P9BE-NEXT:    sub r3, r3, r4
-; P9BE-NEXT:    lis r4, -19946
+; P9BE-NEXT:    lis r4, 2849
 ; P9BE-NEXT:    mtfprwz f0, r3
 ; P9BE-NEXT:    li r3, 4
-; P9BE-NEXT:    ori r4, r4, 17097
+; P9BE-NEXT:    ori r4, r4, 25645
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    mulhwu r4, r3, r4
-; P9BE-NEXT:    srwi r4, r4, 4
 ; P9BE-NEXT:    mulli r4, r4, 23
 ; P9BE-NEXT:    sub r3, r3, r4
+; P9BE-NEXT:    lis r4, 100
 ; P9BE-NEXT:    mtfprwz f1, r3
 ; P9BE-NEXT:    addis r3, r2, .LCPI4_0@toc@ha
+; P9BE-NEXT:    ori r4, r4, 13629
 ; P9BE-NEXT:    addi r3, r3, .LCPI4_0@toc@l
 ; P9BE-NEXT:    lxv vs2, 0(r3)
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    clrlwi r4, r3, 16
-; P9BE-NEXT:    rlwinm r3, r3, 31, 17, 31
+; P9BE-NEXT:    clrlwi r3, r3, 16
 ; P9BE-NEXT:    xxperm vs0, vs1, vs2
-; P9BE-NEXT:    mulhwu r3, r3, r5
-; P9BE-NEXT:    srwi r3, r3, 8
-; P9BE-NEXT:    mulli r3, r3, 654
-; P9BE-NEXT:    sub r3, r4, r3
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    mulli r4, r4, 654
+; P9BE-NEXT:    sub r3, r3, r4
 ; P9BE-NEXT:    mtfprwz f1, r3
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    mtfprwz f3, r3
@@ -911,29 +725,25 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P8LE-LABEL: dont_fold_urem_one:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r3, -14230
-; P8LE-NEXT:    lis r7, -19946
-; P8LE-NEXT:    lis r9, 24749
-; P8LE-NEXT:    ori r3, r3, 30865
-; P8LE-NEXT:    ori r7, r7, 17097
+; P8LE-NEXT:    lis r3, 100
+; P8LE-NEXT:    lis r7, 2849
+; P8LE-NEXT:    lis r8, 12
+; P8LE-NEXT:    li r9, 0
+; P8LE-NEXT:    ori r3, r3, 13629
+; P8LE-NEXT:    ori r7, r7, 25645
+; P8LE-NEXT:    ori r8, r8, 5560
+; P8LE-NEXT:    mtvsrd v2, r9
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    rldicl r5, r4, 48, 48
 ; P8LE-NEXT:    rldicl r6, r4, 32, 48
 ; P8LE-NEXT:    rldicl r4, r4, 16, 48
-; P8LE-NEXT:    rlwinm r8, r5, 31, 17, 31
-; P8LE-NEXT:    clrlwi r6, r6, 16
 ; P8LE-NEXT:    clrlwi r5, r5, 16
-; P8LE-NEXT:    mulhwu r3, r8, r3
-; P8LE-NEXT:    ori r8, r9, 47143
+; P8LE-NEXT:    clrlwi r6, r6, 16
+; P8LE-NEXT:    mulhwu r3, r5, r3
 ; P8LE-NEXT:    clrlwi r4, r4, 16
-; P8LE-NEXT:    li r9, 0
 ; P8LE-NEXT:    mulhwu r7, r6, r7
 ; P8LE-NEXT:    mulhwu r8, r4, r8
-; P8LE-NEXT:    mtvsrd v2, r9
-; P8LE-NEXT:    srwi r3, r3, 8
-; P8LE-NEXT:    srwi r7, r7, 4
 ; P8LE-NEXT:    mulli r3, r3, 654
-; P8LE-NEXT:    srwi r8, r8, 11
 ; P8LE-NEXT:    mulli r7, r7, 23
 ; P8LE-NEXT:    mulli r8, r8, 5423
 ; P8LE-NEXT:    sub r3, r5, r3
@@ -950,37 +760,33 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P8BE-LABEL: dont_fold_urem_one:
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 24749
-; P8BE-NEXT:    lis r7, -19946
-; P8BE-NEXT:    lis r8, -14230
+; P8BE-NEXT:    lis r3, 12
+; P8BE-NEXT:    lis r7, 2849
+; P8BE-NEXT:    lis r8, 100
+; P8BE-NEXT:    addis r9, r2, .LCPI4_0@toc@ha
 ; P8BE-NEXT:    li r10, 0
-; P8BE-NEXT:    ori r3, r3, 47143
-; P8BE-NEXT:    ori r7, r7, 17097
-; P8BE-NEXT:    ori r8, r8, 30865
+; P8BE-NEXT:    ori r3, r3, 5560
+; P8BE-NEXT:    ori r7, r7, 25645
+; P8BE-NEXT:    ori r8, r8, 13629
 ; P8BE-NEXT:    mtvsrwz v2, r10
 ; P8BE-NEXT:    clrldi r5, r4, 48
 ; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    rldicl r4, r4, 32, 48
+; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    clrlwi r6, r6, 16
 ; P8BE-NEXT:    mulhwu r3, r5, r3
-; P8BE-NEXT:    rlwinm r9, r4, 31, 17, 31
+; P8BE-NEXT:    clrlwi r4, r4, 16
 ; P8BE-NEXT:    mulhwu r7, r6, r7
-; P8BE-NEXT:    mulhwu r8, r9, r8
-; P8BE-NEXT:    addis r9, r2, .LCPI4_0@toc@ha
-; P8BE-NEXT:    srwi r3, r3, 11
+; P8BE-NEXT:    mulhwu r8, r4, r8
 ; P8BE-NEXT:    mulli r3, r3, 5423
-; P8BE-NEXT:    srwi r7, r7, 4
-; P8BE-NEXT:    srwi r8, r8, 8
 ; P8BE-NEXT:    mulli r7, r7, 23
 ; P8BE-NEXT:    mulli r8, r8, 654
 ; P8BE-NEXT:    sub r3, r5, r3
 ; P8BE-NEXT:    addi r5, r9, .LCPI4_0@toc@l
-; P8BE-NEXT:    mtvsrwz v4, r3
-; P8BE-NEXT:    clrlwi r3, r4, 16
 ; P8BE-NEXT:    lxvw4x v3, 0, r5
 ; P8BE-NEXT:    sub r5, r6, r7
-; P8BE-NEXT:    sub r3, r3, r8
+; P8BE-NEXT:    mtvsrwz v4, r3
+; P8BE-NEXT:    sub r3, r4, r8
 ; P8BE-NEXT:    mtvsrwz v5, r5
 ; P8BE-NEXT:    mtvsrwz v0, r3
 ; P8BE-NEXT:    vperm v4, v5, v4, v3

diff  --git a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
index e8365fe46ec61..d1622728db47a 100644
--- a/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/RISCV/urem-vector-lkk.ll
@@ -58,35 +58,27 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a3, 8(a1)
 ; RV32IM-NEXT:    lhu a4, 0(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
-; RV32IM-NEXT:    lui a5, 364242
-; RV32IM-NEXT:    addi a5, a5, 777
+; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a5, a4, a5
-; RV32IM-NEXT:    sub a6, a4, a5
-; RV32IM-NEXT:    srli a6, a6, 1
-; RV32IM-NEXT:    add a5, a6, a5
-; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a4, a4, a5
-; RV32IM-NEXT:    srli a5, a1, 2
-; RV32IM-NEXT:    lui a6, 135300
-; RV32IM-NEXT:    addi a6, a6, 529
-; RV32IM-NEXT:    mulhu a5, a5, a6
-; RV32IM-NEXT:    srli a5, a5, 2
+; RV32IM-NEXT:    lui a5, 8456
+; RV32IM-NEXT:    addi a5, a5, 1058
+; RV32IM-NEXT:    mulhu a5, a1, a5
 ; RV32IM-NEXT:    li a6, 124
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a1, a1, a5
-; RV32IM-NEXT:    lui a5, 342392
-; RV32IM-NEXT:    addi a5, a5, 669
+; RV32IM-NEXT:    lui a5, 10700
+; RV32IM-NEXT:    addi a5, a5, -1003
 ; RV32IM-NEXT:    mulhu a5, a3, a5
-; RV32IM-NEXT:    srli a5, a5, 5
 ; RV32IM-NEXT:    li a6, 98
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a3, a3, a5
-; RV32IM-NEXT:    lui a5, 267633
-; RV32IM-NEXT:    addi a5, a5, -1809
+; RV32IM-NEXT:    lui a5, 1045
+; RV32IM-NEXT:    addi a5, a5, 1801
 ; RV32IM-NEXT:    mulhu a5, a2, a5
-; RV32IM-NEXT:    srli a5, a5, 8
 ; RV32IM-NEXT:    li a6, 1003
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a2, a2, a5
@@ -147,33 +139,24 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a5, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 8(a1)
 ; RV64IM-NEXT:    mulhu a3, a2, a3
-; RV64IM-NEXT:    sub a6, a2, a3
-; RV64IM-NEXT:    srli a6, a6, 1
-; RV64IM-NEXT:    add a3, a6, a3
-; RV64IM-NEXT:    srli a3, a3, 6
-; RV64IM-NEXT:    li a6, 95
-; RV64IM-NEXT:    lui a7, %hi(.LCPI0_1)
-; RV64IM-NEXT:    ld a7, %lo(.LCPI0_1)(a7)
-; RV64IM-NEXT:    mulw a3, a3, a6
+; RV64IM-NEXT:    lui a6, %hi(.LCPI0_1)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI0_1)(a6)
+; RV64IM-NEXT:    li a7, 95
+; RV64IM-NEXT:    mulw a3, a3, a7
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 2
-; RV64IM-NEXT:    mulhu a3, a3, a7
-; RV64IM-NEXT:    srli a3, a3, 3
-; RV64IM-NEXT:    li a6, 124
-; RV64IM-NEXT:    lui a7, %hi(.LCPI0_2)
-; RV64IM-NEXT:    ld a7, %lo(.LCPI0_2)(a7)
-; RV64IM-NEXT:    mulw a3, a3, a6
+; RV64IM-NEXT:    mulhu a3, a1, a6
+; RV64IM-NEXT:    lui a6, %hi(.LCPI0_2)
+; RV64IM-NEXT:    ld a6, %lo(.LCPI0_2)(a6)
+; RV64IM-NEXT:    li a7, 124
+; RV64IM-NEXT:    mulw a3, a3, a7
 ; RV64IM-NEXT:    subw a1, a1, a3
-; RV64IM-NEXT:    srli a3, a5, 1
-; RV64IM-NEXT:    mulhu a3, a3, a7
-; RV64IM-NEXT:    srli a3, a3, 4
+; RV64IM-NEXT:    mulhu a3, a5, a6
 ; RV64IM-NEXT:    lui a6, %hi(.LCPI0_3)
 ; RV64IM-NEXT:    ld a6, %lo(.LCPI0_3)(a6)
 ; RV64IM-NEXT:    li a7, 98
 ; RV64IM-NEXT:    mulw a3, a3, a7
 ; RV64IM-NEXT:    subw a5, a5, a3
 ; RV64IM-NEXT:    mulhu a3, a4, a6
-; RV64IM-NEXT:    srli a3, a3, 7
 ; RV64IM-NEXT:    li a6, 1003
 ; RV64IM-NEXT:    mulw a3, a3, a6
 ; RV64IM-NEXT:    subw a4, a4, a3
@@ -235,35 +218,19 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a3, 8(a1)
 ; RV32IM-NEXT:    lhu a4, 0(a1)
 ; RV32IM-NEXT:    lhu a1, 4(a1)
-; RV32IM-NEXT:    lui a5, 364242
-; RV32IM-NEXT:    addi a5, a5, 777
+; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a6, a4, a5
-; RV32IM-NEXT:    sub a7, a4, a6
-; RV32IM-NEXT:    srli a7, a7, 1
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a4, a4, a6
 ; RV32IM-NEXT:    mulhu a6, a1, a5
-; RV32IM-NEXT:    sub t0, a1, a6
-; RV32IM-NEXT:    srli t0, t0, 1
-; RV32IM-NEXT:    add a6, t0, a6
-; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a1, a1, a6
 ; RV32IM-NEXT:    mulhu a6, a3, a5
-; RV32IM-NEXT:    sub t0, a3, a6
-; RV32IM-NEXT:    srli t0, t0, 1
-; RV32IM-NEXT:    add a6, t0, a6
-; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    mul a6, a6, a7
 ; RV32IM-NEXT:    sub a3, a3, a6
 ; RV32IM-NEXT:    mulhu a5, a2, a5
-; RV32IM-NEXT:    sub a6, a2, a5
-; RV32IM-NEXT:    srli a6, a6, 1
-; RV32IM-NEXT:    add a5, a6, a5
-; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    mul a5, a5, a7
 ; RV32IM-NEXT:    sub a2, a2, a5
 ; RV32IM-NEXT:    sh a2, 6(a0)
@@ -323,32 +290,16 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a5, 16(a1)
 ; RV64IM-NEXT:    lhu a1, 8(a1)
 ; RV64IM-NEXT:    mulhu a6, a2, a3
-; RV64IM-NEXT:    sub a7, a2, a6
-; RV64IM-NEXT:    srli a7, a7, 1
-; RV64IM-NEXT:    add a6, a7, a6
-; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mulw a6, a6, a7
 ; RV64IM-NEXT:    subw a2, a2, a6
 ; RV64IM-NEXT:    mulhu a6, a1, a3
-; RV64IM-NEXT:    sub t0, a1, a6
-; RV64IM-NEXT:    srli t0, t0, 1
-; RV64IM-NEXT:    add a6, t0, a6
-; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    mulw a6, a6, a7
 ; RV64IM-NEXT:    subw a1, a1, a6
 ; RV64IM-NEXT:    mulhu a6, a5, a3
-; RV64IM-NEXT:    sub t0, a5, a6
-; RV64IM-NEXT:    srli t0, t0, 1
-; RV64IM-NEXT:    add a6, t0, a6
-; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    mulw a6, a6, a7
 ; RV64IM-NEXT:    subw a5, a5, a6
 ; RV64IM-NEXT:    mulhu a3, a4, a3
-; RV64IM-NEXT:    sub a6, a4, a3
-; RV64IM-NEXT:    srli a6, a6, 1
-; RV64IM-NEXT:    add a3, a6, a3
-; RV64IM-NEXT:    srli a3, a3, 6
 ; RV64IM-NEXT:    mulw a3, a3, a7
 ; RV64IM-NEXT:    subw a4, a4, a3
 ; RV64IM-NEXT:    sh a4, 6(a0)
@@ -439,32 +390,16 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a3, 4(a1)
 ; RV32IM-NEXT:    lhu a4, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
-; RV32IM-NEXT:    lui a5, 364242
-; RV32IM-NEXT:    addi a5, a5, 777
+; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a6, a4, a5
-; RV32IM-NEXT:    sub a7, a4, a6
-; RV32IM-NEXT:    srli a7, a7, 1
-; RV32IM-NEXT:    add a6, a7, a6
-; RV32IM-NEXT:    srli a6, a6, 6
 ; RV32IM-NEXT:    li a7, 95
 ; RV32IM-NEXT:    mul t0, a6, a7
 ; RV32IM-NEXT:    mulhu t1, a1, a5
-; RV32IM-NEXT:    sub t2, a1, t1
-; RV32IM-NEXT:    srli t2, t2, 1
-; RV32IM-NEXT:    add t1, t2, t1
-; RV32IM-NEXT:    srli t1, t1, 6
 ; RV32IM-NEXT:    mul t2, t1, a7
 ; RV32IM-NEXT:    mulhu t3, a3, a5
-; RV32IM-NEXT:    sub t4, a3, t3
-; RV32IM-NEXT:    srli t4, t4, 1
-; RV32IM-NEXT:    add t3, t4, t3
-; RV32IM-NEXT:    srli t3, t3, 6
 ; RV32IM-NEXT:    mul t4, t3, a7
 ; RV32IM-NEXT:    mulhu a5, a2, a5
-; RV32IM-NEXT:    sub t5, a2, a5
-; RV32IM-NEXT:    srli t5, t5, 1
-; RV32IM-NEXT:    add a5, t5, a5
-; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    mul a7, a5, a7
 ; RV32IM-NEXT:    sub a5, a7, a5
 ; RV32IM-NEXT:    sub a2, a2, a5
@@ -559,29 +494,13 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a5, 8(a1)
 ; RV64IM-NEXT:    lhu a1, 16(a1)
 ; RV64IM-NEXT:    mulhu a6, a2, a3
-; RV64IM-NEXT:    sub a7, a2, a6
-; RV64IM-NEXT:    srli a7, a7, 1
-; RV64IM-NEXT:    add a6, a7, a6
-; RV64IM-NEXT:    srli a6, a6, 6
 ; RV64IM-NEXT:    li a7, 95
 ; RV64IM-NEXT:    mulw t0, a6, a7
 ; RV64IM-NEXT:    mulhu t1, a1, a3
-; RV64IM-NEXT:    sub t2, a1, t1
-; RV64IM-NEXT:    srli t2, t2, 1
-; RV64IM-NEXT:    add t1, t2, t1
-; RV64IM-NEXT:    srli t1, t1, 6
 ; RV64IM-NEXT:    mulw t2, t1, a7
 ; RV64IM-NEXT:    mulhu t3, a5, a3
-; RV64IM-NEXT:    sub t4, a5, t3
-; RV64IM-NEXT:    srli t4, t4, 1
-; RV64IM-NEXT:    add t3, t4, t3
-; RV64IM-NEXT:    srli t3, t3, 6
 ; RV64IM-NEXT:    mulw t4, t3, a7
 ; RV64IM-NEXT:    mulhu a3, a4, a3
-; RV64IM-NEXT:    sub t5, a4, a3
-; RV64IM-NEXT:    srli t5, t5, 1
-; RV64IM-NEXT:    add a3, t5, a3
-; RV64IM-NEXT:    srli a3, a3, 6
 ; RV64IM-NEXT:    mulw a7, a3, a7
 ; RV64IM-NEXT:    subw a3, a7, a3
 ; RV64IM-NEXT:    subw a4, a4, a3
@@ -641,13 +560,9 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV32IM-NEXT:    lhu a3, 4(a1)
 ; RV32IM-NEXT:    lhu a4, 12(a1)
 ; RV32IM-NEXT:    lhu a1, 0(a1)
-; RV32IM-NEXT:    lui a5, 364242
-; RV32IM-NEXT:    addi a5, a5, 777
+; RV32IM-NEXT:    lui a5, 11038
+; RV32IM-NEXT:    addi a5, a5, -1465
 ; RV32IM-NEXT:    mulhu a5, a4, a5
-; RV32IM-NEXT:    sub a6, a4, a5
-; RV32IM-NEXT:    srli a6, a6, 1
-; RV32IM-NEXT:    add a5, a6, a5
-; RV32IM-NEXT:    srli a5, a5, 6
 ; RV32IM-NEXT:    li a6, 95
 ; RV32IM-NEXT:    mul a5, a5, a6
 ; RV32IM-NEXT:    sub a4, a4, a5
@@ -700,10 +615,6 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) nounwind {
 ; RV64IM-NEXT:    lhu a5, 8(a1)
 ; RV64IM-NEXT:    lhu a1, 0(a1)
 ; RV64IM-NEXT:    mulhu a3, a2, a3
-; RV64IM-NEXT:    sub a6, a2, a3
-; RV64IM-NEXT:    srli a6, a6, 1
-; RV64IM-NEXT:    add a3, a6, a3
-; RV64IM-NEXT:    srli a3, a3, 6
 ; RV64IM-NEXT:    li a6, 95
 ; RV64IM-NEXT:    mulw a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
@@ -759,36 +670,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV32IM-LABEL: dont_fold_urem_one:
 ; RV32IM:       # %bb.0:
-; RV32IM-NEXT:    lhu a2, 4(a1)
-; RV32IM-NEXT:    lhu a3, 12(a1)
+; RV32IM-NEXT:    lhu a2, 12(a1)
+; RV32IM-NEXT:    lhu a3, 4(a1)
 ; RV32IM-NEXT:    lhu a1, 8(a1)
-; RV32IM-NEXT:    srli a4, a2, 1
-; RV32IM-NEXT:    lui a5, 820904
-; RV32IM-NEXT:    addi a5, a5, -1903
-; RV32IM-NEXT:    mulhu a4, a4, a5
-; RV32IM-NEXT:    srli a4, a4, 8
+; RV32IM-NEXT:    lui a4, 1603
+; RV32IM-NEXT:    addi a4, a4, 1341
+; RV32IM-NEXT:    mulhu a4, a3, a4
 ; RV32IM-NEXT:    li a5, 654
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a2, a2, a4
-; RV32IM-NEXT:    lui a4, 729444
-; RV32IM-NEXT:    addi a4, a4, 713
+; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    lui a4, 45590
+; RV32IM-NEXT:    addi a4, a4, 1069
 ; RV32IM-NEXT:    mulhu a4, a1, a4
-; RV32IM-NEXT:    srli a4, a4, 4
 ; RV32IM-NEXT:    li a5, 23
 ; RV32IM-NEXT:    mul a4, a4, a5
 ; RV32IM-NEXT:    sub a1, a1, a4
-; RV32IM-NEXT:    lui a4, 395996
-; RV32IM-NEXT:    addi a4, a4, -2009
-; RV32IM-NEXT:    mulhu a4, a3, a4
-; RV32IM-NEXT:    srli a4, a4, 11
+; RV32IM-NEXT:    lui a4, 193
+; RV32IM-NEXT:    addi a4, a4, 1464
+; RV32IM-NEXT:    mulhu a4, a2, a4
 ; RV32IM-NEXT:    lui a5, 1
 ; RV32IM-NEXT:    addi a5, a5, 1327
 ; RV32IM-NEXT:    mul a4, a4, a5
-; RV32IM-NEXT:    sub a3, a3, a4
+; RV32IM-NEXT:    sub a2, a2, a4
 ; RV32IM-NEXT:    sh zero, 0(a0)
-; RV32IM-NEXT:    sh a3, 6(a0)
+; RV32IM-NEXT:    sh a2, 6(a0)
 ; RV32IM-NEXT:    sh a1, 4(a0)
-; RV32IM-NEXT:    sh a2, 2(a0)
+; RV32IM-NEXT:    sh a3, 2(a0)
 ; RV32IM-NEXT:    ret
 ;
 ; RV64I-LABEL: dont_fold_urem_one:
@@ -829,39 +736,32 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) nounwind {
 ;
 ; RV64IM-LABEL: dont_fold_urem_one:
 ; RV64IM:       # %bb.0:
-; RV64IM-NEXT:    lhu a2, 16(a1)
+; RV64IM-NEXT:    lhu a2, 8(a1)
 ; RV64IM-NEXT:    lui a3, %hi(.LCPI4_0)
 ; RV64IM-NEXT:    ld a3, %lo(.LCPI4_0)(a3)
 ; RV64IM-NEXT:    lhu a4, 24(a1)
-; RV64IM-NEXT:    lhu a1, 8(a1)
+; RV64IM-NEXT:    lhu a1, 16(a1)
 ; RV64IM-NEXT:    mulhu a3, a2, a3
-; RV64IM-NEXT:    sub a5, a2, a3
-; RV64IM-NEXT:    srli a5, a5, 1
-; RV64IM-NEXT:    add a3, a5, a3
-; RV64IM-NEXT:    srli a3, a3, 4
-; RV64IM-NEXT:    li a5, 23
-; RV64IM-NEXT:    lui a6, %hi(.LCPI4_1)
-; RV64IM-NEXT:    ld a6, %lo(.LCPI4_1)(a6)
-; RV64IM-NEXT:    mulw a3, a3, a5
+; RV64IM-NEXT:    lui a5, %hi(.LCPI4_1)
+; RV64IM-NEXT:    ld a5, %lo(.LCPI4_1)(a5)
+; RV64IM-NEXT:    li a6, 654
+; RV64IM-NEXT:    mulw a3, a3, a6
 ; RV64IM-NEXT:    subw a2, a2, a3
-; RV64IM-NEXT:    srli a3, a1, 1
-; RV64IM-NEXT:    mulhu a3, a3, a6
-; RV64IM-NEXT:    srli a3, a3, 7
+; RV64IM-NEXT:    mulhu a3, a1, a5
 ; RV64IM-NEXT:    lui a5, %hi(.LCPI4_2)
 ; RV64IM-NEXT:    ld a5, %lo(.LCPI4_2)(a5)
-; RV64IM-NEXT:    li a6, 654
+; RV64IM-NEXT:    li a6, 23
 ; RV64IM-NEXT:    mulw a3, a3, a6
 ; RV64IM-NEXT:    subw a1, a1, a3
 ; RV64IM-NEXT:    mulhu a3, a4, a5
-; RV64IM-NEXT:    srli a3, a3, 12
 ; RV64IM-NEXT:    lui a5, 1
 ; RV64IM-NEXT:    addiw a5, a5, 1327
 ; RV64IM-NEXT:    mulw a3, a3, a5
 ; RV64IM-NEXT:    subw a4, a4, a3
 ; RV64IM-NEXT:    sh zero, 0(a0)
 ; RV64IM-NEXT:    sh a4, 6(a0)
-; RV64IM-NEXT:    sh a1, 2(a0)
-; RV64IM-NEXT:    sh a2, 4(a0)
+; RV64IM-NEXT:    sh a1, 4(a0)
+; RV64IM-NEXT:    sh a2, 2(a0)
 ; RV64IM-NEXT:    ret
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
   ret <4 x i16> %1

diff  --git a/llvm/test/CodeGen/X86/divide-by-constant.ll b/llvm/test/CodeGen/X86/divide-by-constant.ll
index cd081051467c1..8c2d26c885138 100644
--- a/llvm/test/CodeGen/X86/divide-by-constant.ll
+++ b/llvm/test/CodeGen/X86/divide-by-constant.ll
@@ -320,10 +320,7 @@ define i64 @PR23590(i64 %x) nounwind {
 ; X64-FAST-NEXT:    movabsq $2635249153387078803, %rcx # imm = 0x2492492492492493
 ; X64-FAST-NEXT:    movq %rdi, %rax
 ; X64-FAST-NEXT:    mulq %rcx
-; X64-FAST-NEXT:    subq %rdx, %rdi
-; X64-FAST-NEXT:    shrq %rdi
-; X64-FAST-NEXT:    leaq (%rdi,%rdx), %rax
-; X64-FAST-NEXT:    shrq $2, %rax
+; X64-FAST-NEXT:    movq %rdx, %rax
 ; X64-FAST-NEXT:    retq
 ;
 ; X64-SLOW-LABEL: PR23590:
@@ -336,11 +333,6 @@ define i64 @PR23590(i64 %x) nounwind {
 ; X64-SLOW-NEXT:    subq %rax, %rdi
 ; X64-SLOW-NEXT:    imulq $613566757, %rdi, %rax # imm = 0x24924925
 ; X64-SLOW-NEXT:    shrq $32, %rax
-; X64-SLOW-NEXT:    subl %eax, %edi
-; X64-SLOW-NEXT:    shrl %edi
-; X64-SLOW-NEXT:    addl %eax, %edi
-; X64-SLOW-NEXT:    shrl $2, %edi
-; X64-SLOW-NEXT:    movq %rdi, %rax
 ; X64-SLOW-NEXT:    retq
 entry:
 	%rem = urem i64 %x, 12345

diff  --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index 404587437f5f3..31874097170d3 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -178,9 +178,8 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX2-NEXT:    movq %rdx, %rcx
 ; X64-AVX2-NEXT:    movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
 ; X64-AVX2-NEXT:    andq %rdx, %rax
-; X64-AVX2-NEXT:    movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX2-NEXT:    mulq %rdx
-; X64-AVX2-NEXT:    shrq $5, %rdx
 ; X64-AVX2-NEXT:    leal (%rdx,%rdx,8), %eax
 ; X64-AVX2-NEXT:    leal (%rdx,%rax,4), %eax
 ; X64-AVX2-NEXT:    subl %eax, %ecx
@@ -346,9 +345,8 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX2-NEXT:    movq %rdx, %rcx
 ; X64-AVX2-NEXT:    movabsq $137438953471, %rax # imm = 0x1FFFFFFFFF
 ; X64-AVX2-NEXT:    andq %rdx, %rax
-; X64-AVX2-NEXT:    movabsq $-2492803253203993461, %rdx # imm = 0xDD67C8A60DD67C8B
+; X64-AVX2-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX2-NEXT:    mulq %rdx
-; X64-AVX2-NEXT:    shrq $5, %rdx
 ; X64-AVX2-NEXT:    leal (%rdx,%rdx,8), %eax
 ; X64-AVX2-NEXT:    leal (%rdx,%rax,4), %eax
 ; X64-AVX2-NEXT:    subl %eax, %ecx

diff  --git a/llvm/test/CodeGen/X86/pr38217.ll b/llvm/test/CodeGen/X86/pr38217.ll
index 19f9fed78f312..3e363706e2a9c 100644
--- a/llvm/test/CodeGen/X86/pr38217.ll
+++ b/llvm/test/CodeGen/X86/pr38217.ll
@@ -19,8 +19,8 @@ define dso_local void @_Z12d2s_bufferedmPc(i64, i8* nocapture) {
 ; CHECK-NEXT:    imulq $10000, %rdx, %rax # imm = 0x2710
 ; CHECK-NEXT:    movq %rdi, %r9
 ; CHECK-NEXT:    subq %rax, %r9
-; CHECK-NEXT:    imulq $1374389535, %r9, %rax # imm = 0x51EB851F
-; CHECK-NEXT:    shrq $37, %rax
+; CHECK-NEXT:    imulq $42949673, %r9, %rax # imm = 0x28F5C29
+; CHECK-NEXT:    shrq $32, %rax
 ; CHECK-NEXT:    imull $100, %eax, %r10d
 ; CHECK-NEXT:    subl %r10d, %r9d
 ; CHECK-NEXT:    movl %ecx, %r10d

diff  --git a/llvm/unittests/Support/DivisionByConstantTest.cpp b/llvm/unittests/Support/DivisionByConstantTest.cpp
index ab4c52c1e74f8..72babee7be355 100644
--- a/llvm/unittests/Support/DivisionByConstantTest.cpp
+++ b/llvm/unittests/Support/DivisionByConstantTest.cpp
@@ -97,11 +97,22 @@ APInt MULHU(APInt X, APInt Y) {
   return (X.zext(WideBits) * Y.zext(WideBits)).lshr(Bits).trunc(Bits);
 }
 
-APInt UnsignedDivideUsingMagic(APInt Numerator, APInt Divisor,
+APInt UnsignedDivideUsingMagic(const APInt &Numerator, const APInt &Divisor,
+                               bool LZOptimization,
                                bool AllowEvenDivisorOptimization, bool ForceNPQ,
                                UnsignedDivisionByConstantInfo Magics) {
   unsigned Bits = Numerator.getBitWidth();
 
+  if (LZOptimization && !Divisor.isOne()) {
+    unsigned LeadingZeros = Numerator.countLeadingZeros();
+    // Clip to the number of leading zeros in the divisor.
+    LeadingZeros = std::min(LeadingZeros, Divisor.countLeadingZeros());
+    if (LeadingZeros > 0) {
+      Magics = UnsignedDivisionByConstantInfo::get(Divisor, LeadingZeros);
+      assert(!Magics.IsAdd && "Should use cheap fixup now");
+    }
+  }
+
   unsigned PreShift = 0;
   if (AllowEvenDivisorOptimization) {
     // If the divisor is even, we can avoid using the expensive fixup by
@@ -159,7 +170,7 @@ TEST(UnsignedDivisionByConstantTest, Test) {
   for (unsigned Bits = 1; Bits <= 32; ++Bits) {
     if (Bits < 2)
       continue; // Not supported by `UnsignedDivisionByConstantInfo::get()`.
-    if (Bits > 11)
+    if (Bits > 10)
       continue; // Unreasonably slow.
     EnumerateAPInts(Bits, [Bits](const APInt &Divisor) {
       if (Divisor.isZero())
@@ -168,17 +179,20 @@ TEST(UnsignedDivisionByConstantTest, Test) {
           UnsignedDivisionByConstantInfo::get(Divisor);
       EnumerateAPInts(Bits, [Divisor, Magics, Bits](const APInt &Numerator) {
         APInt NativeResult = Numerator.udiv(Divisor);
-        for (bool AllowEvenDivisorOptimization : {true, false}) {
-          for (bool ForceNPQ : {false, true}) {
-            APInt MagicResult = UnsignedDivideUsingMagic(
-                Numerator, Divisor, AllowEvenDivisorOptimization, ForceNPQ,
-                Magics);
-            ASSERT_EQ(MagicResult, NativeResult)
-                << " ... given the operation:  urem i" << Bits << " "
-                << Numerator << ", " << Divisor
-                << " (allow even divisior optimization = "
-                << AllowEvenDivisorOptimization << ", force NPQ = " << ForceNPQ
-                << ")";
+        for (bool LZOptimization : {true, false}) {
+          for (bool AllowEvenDivisorOptimization : {true, false}) {
+            for (bool ForceNPQ : {false, true}) {
+              APInt MagicResult = UnsignedDivideUsingMagic(
+                  Numerator, Divisor, LZOptimization,
+                  AllowEvenDivisorOptimization, ForceNPQ, Magics);
+              ASSERT_EQ(MagicResult, NativeResult)
+                    << " ... given the operation:  urem i" << Bits << " "
+                    << Numerator << ", " << Divisor
+                    << " (allow LZ optimization = "
+                    << LZOptimization << ", allow even divisior optimization = "
+                    << AllowEvenDivisorOptimization << ", force NPQ = "
+                    << ForceNPQ << ")";
+            }
           }
         }
       });

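A rough standalone sketch, in the spirit of the exhaustive check in the updated unit test above, of why known leading zeros in the dividend make a fixup-free magic multiplier easier to find. This is not LLVM's algorithm behind UnsignedDivisionByConstantInfo; the helper findFixupFreeMagic below is made up for illustration and simply brute-forces a multiplier/shift pair that never needs the add/sub fixup.

// Rough sketch only: brute-force search for a "fixup-free" magic multiplier.
// The helper name findFixupFreeMagic is hypothetical. Assumes D > 1 and a
// small bit width so that Bits + S stays well below 64.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <utility>

std::optional<std::pair<uint64_t, unsigned>>
findFixupFreeMagic(uint64_t D, unsigned Bits, unsigned LeadingZeros = 0) {
  // Dividends are known to satisfy X < 2^(Bits - LeadingZeros).
  uint64_t Limit = 1ull << (Bits - LeadingZeros);
  for (unsigned S = 0; S <= Bits; ++S) {
    // Smallest candidate multiplier: ceil(2^(Bits + S) / D).
    uint64_t M = ((1ull << (Bits + S)) + D - 1) / D;
    if (M >> Bits)
      continue; // M does not fit in Bits bits -> would need the expensive fixup
    bool Ok = true;
    for (uint64_t X = 0; X < Limit && Ok; ++X)
      Ok = ((X * M) >> (Bits + S)) == X / D; // exhaustive check, like the test
    if (Ok)
      return std::make_pair(M, S);
  }
  return std::nullopt; // every in-range multiplier fails for some dividend
}

int main() {
  // i8 division by 7: no fixup-free magic exists when all 256 dividend values
  // are possible, but a single known leading zero (X < 128) is enough.
  if (!findFixupFreeMagic(7, 8))
    std::printf("i8 udiv by 7: needs fixup\n");
  if (auto MS = findFixupFreeMagic(7, 8, /*LeadingZeros=*/1))
    std::printf("i8 udiv by 7 with 1 leading zero: m=%llu, s=%u\n",
                (unsigned long long)MS->first, MS->second);
  return 0;
}

With the full i8 range the search comes up empty, which corresponds to the sub/shift/add fixup sequences removed in the test diffs above; with one known leading zero it finds m = 147, s = 2, i.e. (x * 147) >> 10 == x / 7 for all x < 128.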

        

