[llvm] b631f86 - [TLI][PowerPC] Introduce TLI query to check if MULH is cheaper than MUL + SHIFT

Amy Kwan via llvm-commits llvm-commits at lists.llvm.org
Sat May 23 14:48:48 PDT 2020


Author: Amy Kwan
Date: 2020-05-23T16:47:12-05:00
New Revision: b631f86ac5b9df3f87ae963415d17e35104eca86

URL: https://github.com/llvm/llvm-project/commit/b631f86ac5b9df3f87ae963415d17e35104eca86
DIFF: https://github.com/llvm/llvm-project/commit/b631f86ac5b9df3f87ae963415d17e35104eca86.diff

LOG: [TLI][PowerPC] Introduce TLI query to check if MULH is cheaper than MUL + SHIFT

This patch introduces a TargetLowering query, isMulhCheaperThanMulShift.

Currently, DAG Combine will transform mulhs/mulhu into a
wider multiply plus a shift if the wider multiply type is legal.

This TLI function is implemented on 64-bit PowerPC, as it is more desirable to
have multiply-high over multiply + shift for words and doublewords. Keeping the
multiply-high node also enables further transformations later in the pipeline.

Differential Revision: https://reviews.llvm.org/D78271

Added: 
    

Modified: 
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/lib/Target/PowerPC/PPCISelLowering.h
    llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
    llvm/test/CodeGen/PowerPC/machine-pre.ll
    llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
    llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
    llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 522621471af2..2689838b3e7c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1652,6 +1652,10 @@ class TargetLoweringBase {
 
   virtual bool isJumpTableRelative() const;
 
+  /// Return true if a mulh[s|u] node for a specific type is cheaper than
+  /// a multiply followed by a shift. This is false by default.
+  virtual bool isMulhCheaperThanMulShift(EVT Type) const { return false; }
+
   /// If a physical register, this specifies the register that
   /// llvm.savestack/llvm.restorestack should save and restore.
   unsigned getStackPointerRegisterToSaveRestore() const {

diff  --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b076f5e0db49..40ceb5b34ad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4118,7 +4118,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
 
   // If the type twice as wide is legal, transform the mulhs to a wider multiply
   // plus a shift.
-  if (VT.isSimple() && !VT.isVector()) {
+  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
     MVT Simple = VT.getSimpleVT();
     unsigned SimpleSize = Simple.getSizeInBits();
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
@@ -4174,7 +4174,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
 
   // If the type twice as wide is legal, transform the mulhu to a wider multiply
   // plus a shift.
-  if (VT.isSimple() && !VT.isVector()) {
+  if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
     MVT Simple = VT.getSimpleVT();
     unsigned SimpleSize = Simple.getSizeInBits();
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8b1ebba596a0..d42eaa7b7706 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1401,6 +1401,16 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
   return VT.isScalarInteger();
 }
 
+/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
+/// type is cheaper than a multiply followed by a shift.
+/// This is true for words and doublewords on 64-bit PowerPC.
+bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
+  if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
+                              isOperationLegal(ISD::MULHU, Type)))
+    return true;
+  return TargetLowering::isMulhCheaperThanMulShift(Type);
+}
+
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((PPCISD::NodeType)Opcode) {
   case PPCISD::FIRST_NUMBER:    break;

diff  --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index c34fd6aa78be..29d4e54edc67 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -950,6 +950,11 @@ namespace llvm {
     Register
     getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
 
+    /// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a
+    /// specific type is cheaper than a multiply followed by a shift.
+    /// This is true for words and doublewords on 64-bit PowerPC.
+    bool isMulhCheaperThanMulShift(EVT Type) const override;
+
     /// Override to support customized stack guard loading.
     bool useLoadStackGuardNode() const override;
     void insertSSPDeclarations(Module &M) const override;

diff  --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 42a2c7828052..2463e9114794 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -509,10 +509,9 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
 ; CHECK-NEXT:    bdz .LBB6_9
 ; CHECK-NEXT:  .LBB6_4: #
 ; CHECK-NEXT:    lbzu r0, 1(r5)
-; CHECK-NEXT:    clrldi r27, r0, 32
-; CHECK-NEXT:    mulld r27, r27, r4
-; CHECK-NEXT:    rldicl r27, r27, 31, 33
-; CHECK-NEXT:    slwi r26, r27, 1
+; CHECK-NEXT:    mulhwu r27, r0, r4
+; CHECK-NEXT:    rlwinm r26, r27, 0, 0, 30
+; CHECK-NEXT:    srwi r27, r27, 1
 ; CHECK-NEXT:    add r27, r27, r26
 ; CHECK-NEXT:    subf r0, r27, r0
 ; CHECK-NEXT:    cmplwi r0, 1

diff  --git a/llvm/test/CodeGen/PowerPC/machine-pre.ll b/llvm/test/CodeGen/PowerPC/machine-pre.ll
index 0a7949725723..38ed67c70989 100644
--- a/llvm/test/CodeGen/PowerPC/machine-pre.ll
+++ b/llvm/test/CodeGen/PowerPC/machine-pre.ll
@@ -91,14 +91,12 @@ define dso_local signext i32 @foo(i32 signext %x, i32 signext %y) nounwind {
 ; CHECK-P9-NEXT:    bl bar
 ; CHECK-P9-NEXT:    nop
 ; CHECK-P9-NEXT:    mr r30, r3
-; CHECK-P9-NEXT:    extsw r3, r28
-; CHECK-P9-NEXT:    mulld r4, r3, r27
-; CHECK-P9-NEXT:    rldicl r5, r4, 1, 63
-; CHECK-P9-NEXT:    rldicl r4, r4, 32, 32
-; CHECK-P9-NEXT:    add r4, r4, r5
-; CHECK-P9-NEXT:    slwi r5, r4, 1
-; CHECK-P9-NEXT:    add r4, r4, r5
-; CHECK-P9-NEXT:    subf r3, r4, r3
+; CHECK-P9-NEXT:    mulhw r3, r28, r27
+; CHECK-P9-NEXT:    srwi r4, r3, 31
+; CHECK-P9-NEXT:    add r3, r3, r4
+; CHECK-P9-NEXT:    slwi r4, r3, 1
+; CHECK-P9-NEXT:    add r3, r3, r4
+; CHECK-P9-NEXT:    subf r3, r3, r28
 ; CHECK-P9-NEXT:    cmplwi r3, 1
 ; CHECK-P9-NEXT:    beq cr0, .LBB1_1
 ; CHECK-P9-NEXT:  # %bb.5: # %while.cond

diff  --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
index 547f9273f5a4..56299427ab9d 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
@@ -205,13 +205,13 @@ entry:
   ret i32 %rem
 ; CHECK-LABEL: modulo_const3_sw
 ; CHECK-NOT: modsw
-; CHECK: mull
+; CHECK: mulh
 ; CHECK-NOT: modsw
 ; CHECK: sub
 ; CHECK-NOT: modsw
 ; CHECK: blr
 ; CHECK-PWR8-LABEL: modulo_const3_sw
-; CHECK-PWR8: mull
+; CHECK-PWR8: mulh
 ; CHECK-PWR8: sub
 ; CHECK-PWR8: blr
 }

diff  --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
index 051e467cf39b..cda3fbb52ee8 100644
--- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
@@ -13,12 +13,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -21386
 ; P9LE-NEXT:    ori r5, r5, 37253
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -30,10 +28,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
 ; P9LE-NEXT:    ori r5, r5, 63421
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    subf r4, r4, r5
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -46,11 +42,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
 ; P9LE-NEXT:    ori r5, r5, 33437
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r5, r4, 1, 63
-; P9LE-NEXT:    rldicl r4, r4, 32, 32
+; P9LE-NEXT:    mulhw r4, r4, r5
+; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 5
 ; P9LE-NEXT:    add r4, r4, r5
 ; P9LE-NEXT:    lis r5, -16728
@@ -61,11 +55,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
 ; P9LE-NEXT:    ori r5, r5, 63249
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r5, r4, 1, 63
-; P9LE-NEXT:    rldicl r4, r4, 32, 32
+; P9LE-NEXT:    mulhw r4, r4, r5
+; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 8
 ; P9LE-NEXT:    add r4, r4, r5
 ; P9LE-NEXT:    mulli r4, r4, -1003
@@ -82,12 +74,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    extsh r3, r3
 ; P9BE-NEXT:    lis r4, 31710
 ; P9BE-NEXT:    ori r4, r4, 63421
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    extsh r3, r3
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    subf r4, r3, r4
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 6
@@ -100,10 +90,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 37253
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 6
@@ -116,11 +104,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 63249
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r5, r4, 1, 63
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
+; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 8
 ; P9BE-NEXT:    add r4, r4, r5
 ; P9BE-NEXT:    mulli r4, r4, -1003
@@ -132,11 +118,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 33437
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r5, r4, 1, 63
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
+; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 5
 ; P9BE-NEXT:    add r4, r4, r5
 ; P9BE-NEXT:    mulli r4, r4, 98
@@ -150,61 +134,51 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P8LE-LABEL: fold_srem_vec_1:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r4, 21399
-; P8LE-NEXT:    lis r9, -16728
-; P8LE-NEXT:    lis r11, -21386
-; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r4, r4, 33437
-; P8LE-NEXT:    ori r9, r9, 63249
-; P8LE-NEXT:    ori r11, r11, 37253
-; P8LE-NEXT:    mffprd r5, f0
-; P8LE-NEXT:    rldicl r3, r5, 32, 48
-; P8LE-NEXT:    rldicl r6, r5, 16, 48
-; P8LE-NEXT:    clrldi r7, r5, 48
-; P8LE-NEXT:    extsh r8, r3
-; P8LE-NEXT:    extsh r10, r6
-; P8LE-NEXT:    rldicl r5, r5, 48, 48
-; P8LE-NEXT:    extsw r8, r8
-; P8LE-NEXT:    extsh r12, r7
-; P8LE-NEXT:    extsw r10, r10
-; P8LE-NEXT:    mulld r4, r8, r4
-; P8LE-NEXT:    lis r8, 31710
-; P8LE-NEXT:    extsh r0, r5
-; P8LE-NEXT:    extsw r12, r12
-; P8LE-NEXT:    mulld r9, r10, r9
-; P8LE-NEXT:    ori r8, r8, 63421
-; P8LE-NEXT:    extsw r10, r0
-; P8LE-NEXT:    mulld r11, r12, r11
-; P8LE-NEXT:    mulld r8, r10, r8
-; P8LE-NEXT:    rldicl r0, r4, 1, 63
-; P8LE-NEXT:    rldicl r4, r4, 32, 32
-; P8LE-NEXT:    rldicl r30, r9, 1, 63
-; P8LE-NEXT:    rldicl r9, r9, 32, 32
-; P8LE-NEXT:    rldicl r11, r11, 32, 32
-; P8LE-NEXT:    rldicl r8, r8, 32, 32
-; P8LE-NEXT:    add r11, r11, r12
-; P8LE-NEXT:    srawi r4, r4, 5
-; P8LE-NEXT:    subf r8, r10, r8
-; P8LE-NEXT:    srawi r9, r9, 8
-; P8LE-NEXT:    srwi r10, r11, 31
-; P8LE-NEXT:    add r4, r4, r0
-; P8LE-NEXT:    srawi r11, r11, 6
-; P8LE-NEXT:    add r9, r9, r30
-; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; P8LE-NEXT:    add r10, r11, r10
-; P8LE-NEXT:    srwi r11, r8, 31
-; P8LE-NEXT:    srawi r8, r8, 6
-; P8LE-NEXT:    mulli r4, r4, 98
-; P8LE-NEXT:    mulli r9, r9, -1003
-; P8LE-NEXT:    add r8, r8, r11
-; P8LE-NEXT:    mulli r10, r10, 95
-; P8LE-NEXT:    mulli r8, r8, -124
-; P8LE-NEXT:    subf r3, r4, r3
-; P8LE-NEXT:    subf r4, r9, r6
+; P8LE-NEXT:    lis r3, 21399
+; P8LE-NEXT:    lis r9, -21386
+; P8LE-NEXT:    lis r11, 31710
+; P8LE-NEXT:    lis r8, -16728
+; P8LE-NEXT:    ori r3, r3, 33437
+; P8LE-NEXT:    ori r9, r9, 37253
+; P8LE-NEXT:    ori r8, r8, 63249
+; P8LE-NEXT:    mffprd r4, f0
+; P8LE-NEXT:    rldicl r5, r4, 32, 48
+; P8LE-NEXT:    clrldi r7, r4, 48
+; P8LE-NEXT:    rldicl r6, r4, 16, 48
+; P8LE-NEXT:    rldicl r4, r4, 48, 48
+; P8LE-NEXT:    extsh r10, r5
+; P8LE-NEXT:    extsh r0, r7
+; P8LE-NEXT:    mulhw r3, r10, r3
+; P8LE-NEXT:    ori r10, r11, 63421
+; P8LE-NEXT:    extsh r11, r4
+; P8LE-NEXT:    extsh r12, r6
+; P8LE-NEXT:    mulhw r9, r0, r9
+; P8LE-NEXT:    mulhw r10, r11, r10
+; P8LE-NEXT:    mulhw r8, r12, r8
+; P8LE-NEXT:    srwi r12, r3, 31
+; P8LE-NEXT:    srawi r3, r3, 5
+; P8LE-NEXT:    add r9, r9, r0
+; P8LE-NEXT:    subf r10, r11, r10
+; P8LE-NEXT:    add r3, r3, r12
+; P8LE-NEXT:    srwi r11, r9, 31
+; P8LE-NEXT:    srawi r9, r9, 6
+; P8LE-NEXT:    srwi r12, r8, 31
+; P8LE-NEXT:    srawi r8, r8, 8
+; P8LE-NEXT:    add r9, r9, r11
+; P8LE-NEXT:    srwi r11, r10, 31
+; P8LE-NEXT:    srawi r10, r10, 6
+; P8LE-NEXT:    add r8, r8, r12
+; P8LE-NEXT:    mulli r3, r3, 98
+; P8LE-NEXT:    add r10, r10, r11
+; P8LE-NEXT:    mulli r8, r8, -1003
+; P8LE-NEXT:    mulli r9, r9, 95
+; P8LE-NEXT:    mulli r10, r10, -124
+; P8LE-NEXT:    subf r3, r3, r5
+; P8LE-NEXT:    subf r5, r8, r6
 ; P8LE-NEXT:    mtfprd f0, r3
-; P8LE-NEXT:    subf r3, r10, r7
-; P8LE-NEXT:    mtfprd f1, r4
-; P8LE-NEXT:    subf r4, r8, r5
+; P8LE-NEXT:    subf r3, r9, r7
+; P8LE-NEXT:    subf r4, r10, r4
+; P8LE-NEXT:    mtfprd f1, r5
 ; P8LE-NEXT:    mtfprd f2, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    mtfprd f3, r4
@@ -220,42 +194,34 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
 ; P8BE-NEXT:    lis r3, -16728
-; P8BE-NEXT:    lis r9, 31710
 ; P8BE-NEXT:    lis r8, 21399
+; P8BE-NEXT:    lis r9, 31710
 ; P8BE-NEXT:    lis r10, -21386
 ; P8BE-NEXT:    ori r3, r3, 63249
-; P8BE-NEXT:    ori r9, r9, 63421
 ; P8BE-NEXT:    ori r8, r8, 33437
+; P8BE-NEXT:    ori r9, r9, 63421
 ; P8BE-NEXT:    ori r10, r10, 37253
 ; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r7, r4, 32, 48
 ; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    rldicl r4, r4, 16, 48
+; P8BE-NEXT:    rldicl r7, r4, 32, 48
 ; P8BE-NEXT:    extsh r5, r5
-; P8BE-NEXT:    extsh r7, r7
 ; P8BE-NEXT:    extsh r6, r6
-; P8BE-NEXT:    extsw r5, r5
+; P8BE-NEXT:    rldicl r4, r4, 16, 48
+; P8BE-NEXT:    extsh r7, r7
+; P8BE-NEXT:    mulhw r3, r5, r3
 ; P8BE-NEXT:    extsh r4, r4
-; P8BE-NEXT:    extsw r7, r7
-; P8BE-NEXT:    extsw r6, r6
-; P8BE-NEXT:    mulld r3, r5, r3
-; P8BE-NEXT:    extsw r4, r4
-; P8BE-NEXT:    mulld r9, r7, r9
-; P8BE-NEXT:    mulld r8, r6, r8
-; P8BE-NEXT:    mulld r10, r4, r10
-; P8BE-NEXT:    rldicl r11, r3, 1, 63
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
-; P8BE-NEXT:    rldicl r9, r9, 32, 32
-; P8BE-NEXT:    rldicl r12, r8, 1, 63
-; P8BE-NEXT:    rldicl r8, r8, 32, 32
-; P8BE-NEXT:    rldicl r10, r10, 32, 32
-; P8BE-NEXT:    subf r9, r7, r9
+; P8BE-NEXT:    mulhw r8, r6, r8
+; P8BE-NEXT:    mulhw r9, r7, r9
+; P8BE-NEXT:    mulhw r10, r4, r10
+; P8BE-NEXT:    srwi r11, r3, 31
 ; P8BE-NEXT:    srawi r3, r3, 8
+; P8BE-NEXT:    add r3, r3, r11
+; P8BE-NEXT:    srwi r11, r8, 31
+; P8BE-NEXT:    subf r9, r7, r9
 ; P8BE-NEXT:    srawi r8, r8, 5
 ; P8BE-NEXT:    add r10, r10, r4
-; P8BE-NEXT:    add r3, r3, r11
+; P8BE-NEXT:    add r8, r8, r11
 ; P8BE-NEXT:    srwi r11, r9, 31
-; P8BE-NEXT:    add r8, r8, r12
 ; P8BE-NEXT:    srawi r9, r9, 6
 ; P8BE-NEXT:    mulli r3, r3, -1003
 ; P8BE-NEXT:    add r9, r9, r11
@@ -290,12 +256,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -21386
 ; P9LE-NEXT:    ori r5, r5, 37253
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r6, r4, r5
-; P9LE-NEXT:    rldicl r6, r6, 32, 32
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r6, r4, r5
 ; P9LE-NEXT:    add r4, r6, r4
 ; P9LE-NEXT:    srwi r6, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -306,9 +270,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r6, r4, r5
-; P9LE-NEXT:    rldicl r6, r6, 32, 32
+; P9LE-NEXT:    mulhw r6, r4, r5
 ; P9LE-NEXT:    add r4, r6, r4
 ; P9LE-NEXT:    srwi r6, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -320,9 +282,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r6, r4, r5
-; P9LE-NEXT:    rldicl r6, r6, 32, 32
+; P9LE-NEXT:    mulhw r6, r4, r5
 ; P9LE-NEXT:    add r4, r6, r4
 ; P9LE-NEXT:    srwi r6, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -334,9 +294,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -355,12 +313,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    extsh r3, r3
 ; P9BE-NEXT:    lis r4, -21386
 ; P9BE-NEXT:    ori r4, r4, 37253
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r5, r3, r4
-; P9BE-NEXT:    rldicl r5, r5, 32, 32
+; P9BE-NEXT:    extsh r3, r3
+; P9BE-NEXT:    mulhw r5, r3, r4
 ; P9BE-NEXT:    add r5, r5, r3
 ; P9BE-NEXT:    srwi r6, r5, 31
 ; P9BE-NEXT:    srawi r5, r5, 6
@@ -372,9 +328,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r5, r3, r4
-; P9BE-NEXT:    rldicl r5, r5, 32, 32
+; P9BE-NEXT:    mulhw r5, r3, r4
 ; P9BE-NEXT:    add r5, r5, r3
 ; P9BE-NEXT:    srwi r6, r5, 31
 ; P9BE-NEXT:    srawi r5, r5, 6
@@ -386,9 +340,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r5, r3, r4
-; P9BE-NEXT:    rldicl r5, r5, 32, 32
+; P9BE-NEXT:    mulhw r5, r3, r4
 ; P9BE-NEXT:    add r5, r5, r3
 ; P9BE-NEXT:    srwi r6, r5, 31
 ; P9BE-NEXT:    srawi r5, r5, 6
@@ -401,9 +353,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 6
@@ -419,64 +369,56 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P8LE-LABEL: fold_srem_vec_2:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r4, -21386
+; P8LE-NEXT:    lis r3, -21386
 ; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r4, r4, 37253
-; P8LE-NEXT:    mffprd r5, f0
-; P8LE-NEXT:    clrldi r3, r5, 48
-; P8LE-NEXT:    rldicl r7, r5, 32, 48
-; P8LE-NEXT:    extsh r8, r3
-; P8LE-NEXT:    rldicl r6, r5, 48, 48
-; P8LE-NEXT:    extsh r10, r7
-; P8LE-NEXT:    rldicl r5, r5, 16, 48
-; P8LE-NEXT:    extsw r8, r8
+; P8LE-NEXT:    ori r3, r3, 37253
+; P8LE-NEXT:    mffprd r4, f0
+; P8LE-NEXT:    clrldi r5, r4, 48
+; P8LE-NEXT:    rldicl r6, r4, 48, 48
+; P8LE-NEXT:    extsh r8, r5
+; P8LE-NEXT:    rldicl r7, r4, 32, 48
 ; P8LE-NEXT:    extsh r9, r6
-; P8LE-NEXT:    extsw r10, r10
-; P8LE-NEXT:    extsh r11, r5
-; P8LE-NEXT:    mulld r12, r8, r4
-; P8LE-NEXT:    extsw r9, r9
-; P8LE-NEXT:    extsw r11, r11
-; P8LE-NEXT:    mulld r30, r10, r4
-; P8LE-NEXT:    mulld r0, r9, r4
-; P8LE-NEXT:    mulld r4, r11, r4
-; P8LE-NEXT:    rldicl r12, r12, 32, 32
-; P8LE-NEXT:    add r8, r12, r8
-; P8LE-NEXT:    rldicl r12, r30, 32, 32
-; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; P8LE-NEXT:    rldicl r0, r0, 32, 32
-; P8LE-NEXT:    rldicl r4, r4, 32, 32
-; P8LE-NEXT:    add r10, r12, r10
-; P8LE-NEXT:    add r9, r0, r9
-; P8LE-NEXT:    srwi r0, r8, 31
-; P8LE-NEXT:    add r4, r4, r11
-; P8LE-NEXT:    srwi r11, r10, 31
+; P8LE-NEXT:    mulhw r10, r8, r3
+; P8LE-NEXT:    rldicl r4, r4, 16, 48
+; P8LE-NEXT:    extsh r11, r7
+; P8LE-NEXT:    mulhw r12, r9, r3
+; P8LE-NEXT:    extsh r0, r4
+; P8LE-NEXT:    mulhw r30, r11, r3
+; P8LE-NEXT:    mulhw r3, r0, r3
+; P8LE-NEXT:    add r8, r10, r8
+; P8LE-NEXT:    add r9, r12, r9
+; P8LE-NEXT:    srwi r10, r8, 31
 ; P8LE-NEXT:    srawi r8, r8, 6
-; P8LE-NEXT:    srawi r10, r10, 6
-; P8LE-NEXT:    srwi r12, r9, 31
-; P8LE-NEXT:    add r8, r8, r0
+; P8LE-NEXT:    add r11, r30, r11
+; P8LE-NEXT:    add r3, r3, r0
+; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT:    add r8, r8, r10
+; P8LE-NEXT:    srwi r10, r9, 31
 ; P8LE-NEXT:    srawi r9, r9, 6
-; P8LE-NEXT:    add r10, r10, r11
-; P8LE-NEXT:    srwi r11, r4, 31
-; P8LE-NEXT:    srawi r4, r4, 6
-; P8LE-NEXT:    add r9, r9, r12
 ; P8LE-NEXT:    mulli r8, r8, 95
-; P8LE-NEXT:    add r4, r4, r11
+; P8LE-NEXT:    add r9, r9, r10
+; P8LE-NEXT:    srwi r10, r11, 31
+; P8LE-NEXT:    srawi r11, r11, 6
 ; P8LE-NEXT:    mulli r9, r9, 95
+; P8LE-NEXT:    add r10, r11, r10
+; P8LE-NEXT:    srwi r11, r3, 31
+; P8LE-NEXT:    srawi r3, r3, 6
 ; P8LE-NEXT:    mulli r10, r10, 95
-; P8LE-NEXT:    mulli r4, r4, 95
-; P8LE-NEXT:    subf r3, r8, r3
+; P8LE-NEXT:    subf r5, r8, r5
+; P8LE-NEXT:    add r3, r3, r11
+; P8LE-NEXT:    mtfprd f0, r5
+; P8LE-NEXT:    mulli r3, r3, 95
 ; P8LE-NEXT:    subf r6, r9, r6
-; P8LE-NEXT:    mtfprd f0, r3
-; P8LE-NEXT:    subf r3, r10, r7
-; P8LE-NEXT:    subf r4, r4, r5
 ; P8LE-NEXT:    mtfprd f1, r6
-; P8LE-NEXT:    mtfprd f2, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    mtfprd f3, r4
+; P8LE-NEXT:    subf r5, r10, r7
+; P8LE-NEXT:    mtfprd f2, r5
 ; P8LE-NEXT:    xxswapd v3, vs1
+; P8LE-NEXT:    subf r3, r3, r4
+; P8LE-NEXT:    mtfprd f3, r3
 ; P8LE-NEXT:    xxswapd v4, vs2
-; P8LE-NEXT:    xxswapd v5, vs3
 ; P8LE-NEXT:    vmrglh v2, v3, v2
+; P8LE-NEXT:    xxswapd v5, vs3
 ; P8LE-NEXT:    vmrglh v3, v5, v4
 ; P8LE-NEXT:    vmrglw v2, v3, v2
 ; P8LE-NEXT:    blr
@@ -491,29 +433,21 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
 ; P8BE-NEXT:    extsh r5, r5
 ; P8BE-NEXT:    rldicl r7, r4, 32, 48
 ; P8BE-NEXT:    extsh r6, r6
-; P8BE-NEXT:    extsw r5, r5
+; P8BE-NEXT:    mulhw r8, r5, r3
 ; P8BE-NEXT:    rldicl r4, r4, 16, 48
 ; P8BE-NEXT:    extsh r7, r7
-; P8BE-NEXT:    extsw r6, r6
-; P8BE-NEXT:    mulld r8, r5, r3
+; P8BE-NEXT:    mulhw r9, r6, r3
 ; P8BE-NEXT:    extsh r4, r4
-; P8BE-NEXT:    extsw r7, r7
-; P8BE-NEXT:    mulld r9, r6, r3
-; P8BE-NEXT:    extsw r4, r4
-; P8BE-NEXT:    mulld r10, r7, r3
-; P8BE-NEXT:    mulld r3, r4, r3
-; P8BE-NEXT:    rldicl r8, r8, 32, 32
-; P8BE-NEXT:    rldicl r9, r9, 32, 32
+; P8BE-NEXT:    mulhw r10, r7, r3
+; P8BE-NEXT:    mulhw r3, r4, r3
 ; P8BE-NEXT:    add r8, r8, r5
-; P8BE-NEXT:    rldicl r10, r10, 32, 32
 ; P8BE-NEXT:    add r9, r9, r6
 ; P8BE-NEXT:    srwi r11, r8, 31
 ; P8BE-NEXT:    srawi r8, r8, 6
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
 ; P8BE-NEXT:    add r10, r10, r7
+; P8BE-NEXT:    add r3, r3, r4
 ; P8BE-NEXT:    add r8, r8, r11
 ; P8BE-NEXT:    srwi r11, r9, 31
-; P8BE-NEXT:    add r3, r3, r4
 ; P8BE-NEXT:    srawi r9, r9, 6
 ; P8BE-NEXT:    mulli r8, r8, 95
 ; P8BE-NEXT:    add r9, r9, r11
@@ -553,12 +487,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -21386
 ; P9LE-NEXT:    ori r5, r5, 37253
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r6, r4, r5
-; P9LE-NEXT:    rldicl r6, r6, 32, 32
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r6, r4, r5
 ; P9LE-NEXT:    add r4, r6, r4
 ; P9LE-NEXT:    srwi r6, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -569,9 +501,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r6, r3
-; P9LE-NEXT:    extsw r6, r6
-; P9LE-NEXT:    mulld r7, r6, r5
-; P9LE-NEXT:    rldicl r7, r7, 32, 32
+; P9LE-NEXT:    mulhw r7, r6, r5
 ; P9LE-NEXT:    add r6, r7, r6
 ; P9LE-NEXT:    srwi r7, r6, 31
 ; P9LE-NEXT:    srawi r6, r6, 6
@@ -583,9 +513,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r7, r3
-; P9LE-NEXT:    extsw r7, r7
-; P9LE-NEXT:    mulld r8, r7, r5
-; P9LE-NEXT:    rldicl r8, r8, 32, 32
+; P9LE-NEXT:    mulhw r8, r7, r5
 ; P9LE-NEXT:    add r7, r8, r7
 ; P9LE-NEXT:    srwi r8, r7, 31
 ; P9LE-NEXT:    srawi r7, r7, 6
@@ -597,9 +525,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r8, r3
-; P9LE-NEXT:    extsw r8, r8
-; P9LE-NEXT:    mulld r5, r8, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhw r5, r8, r5
 ; P9LE-NEXT:    add r5, r5, r8
 ; P9LE-NEXT:    srwi r8, r5, 31
 ; P9LE-NEXT:    srawi r5, r5, 6
@@ -630,12 +556,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    extsh r4, r3
 ; P9BE-NEXT:    lis r5, -21386
 ; P9BE-NEXT:    ori r5, r5, 37253
-; P9BE-NEXT:    extsw r4, r4
-; P9BE-NEXT:    mulld r6, r4, r5
-; P9BE-NEXT:    rldicl r6, r6, 32, 32
+; P9BE-NEXT:    extsh r4, r3
+; P9BE-NEXT:    mulhw r6, r4, r5
 ; P9BE-NEXT:    add r4, r6, r4
 ; P9BE-NEXT:    srwi r6, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 6
@@ -647,9 +571,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r6, r3
-; P9BE-NEXT:    extsw r6, r6
-; P9BE-NEXT:    mulld r7, r6, r5
-; P9BE-NEXT:    rldicl r7, r7, 32, 32
+; P9BE-NEXT:    mulhw r7, r6, r5
 ; P9BE-NEXT:    add r6, r7, r6
 ; P9BE-NEXT:    srwi r7, r6, 31
 ; P9BE-NEXT:    srawi r6, r6, 6
@@ -661,9 +583,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r7, r3
-; P9BE-NEXT:    extsw r7, r7
-; P9BE-NEXT:    mulld r8, r7, r5
-; P9BE-NEXT:    rldicl r8, r8, 32, 32
+; P9BE-NEXT:    mulhw r8, r7, r5
 ; P9BE-NEXT:    add r7, r8, r7
 ; P9BE-NEXT:    srwi r8, r7, 31
 ; P9BE-NEXT:    srawi r7, r7, 6
@@ -676,9 +596,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r5, r3, r5
-; P9BE-NEXT:    rldicl r5, r5, 32, 32
+; P9BE-NEXT:    mulhw r5, r3, r5
 ; P9BE-NEXT:    add r5, r5, r3
 ; P9BE-NEXT:    srwi r8, r5, 31
 ; P9BE-NEXT:    srawi r5, r5, 6
@@ -706,66 +624,58 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ; P8LE-LABEL: combine_srem_sdiv:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r5, -21386
+; P8LE-NEXT:    lis r4, -21386
 ; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r5, r5, 37253
-; P8LE-NEXT:    mffprd r6, f0
-; P8LE-NEXT:    clrldi r3, r6, 48
-; P8LE-NEXT:    rldicl r4, r6, 48, 48
-; P8LE-NEXT:    rldicl r7, r6, 32, 48
+; P8LE-NEXT:    ori r4, r4, 37253
+; P8LE-NEXT:    mffprd r5, f0
+; P8LE-NEXT:    clrldi r3, r5, 48
+; P8LE-NEXT:    rldicl r6, r5, 48, 48
+; P8LE-NEXT:    rldicl r7, r5, 32, 48
 ; P8LE-NEXT:    extsh r8, r3
-; P8LE-NEXT:    extsh r9, r4
-; P8LE-NEXT:    rldicl r6, r6, 16, 48
+; P8LE-NEXT:    extsh r9, r6
 ; P8LE-NEXT:    extsh r10, r7
-; P8LE-NEXT:    extsw r8, r8
-; P8LE-NEXT:    extsw r9, r9
-; P8LE-NEXT:    extsh r11, r6
-; P8LE-NEXT:    extsw r10, r10
-; P8LE-NEXT:    mulld r12, r8, r5
-; P8LE-NEXT:    extsw r11, r11
-; P8LE-NEXT:    mulld r0, r9, r5
-; P8LE-NEXT:    mulld r30, r10, r5
-; P8LE-NEXT:    mulld r5, r11, r5
-; P8LE-NEXT:    rldicl r12, r12, 32, 32
-; P8LE-NEXT:    rldicl r0, r0, 32, 32
-; P8LE-NEXT:    rldicl r30, r30, 32, 32
-; P8LE-NEXT:    add r8, r12, r8
-; P8LE-NEXT:    rldicl r5, r5, 32, 32
-; P8LE-NEXT:    add r9, r0, r9
-; P8LE-NEXT:    add r10, r30, r10
-; P8LE-NEXT:    srwi r12, r8, 31
-; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT:    mulhw r11, r8, r4
+; P8LE-NEXT:    rldicl r5, r5, 16, 48
+; P8LE-NEXT:    mulhw r12, r9, r4
+; P8LE-NEXT:    mulhw r0, r10, r4
+; P8LE-NEXT:    extsh r30, r5
+; P8LE-NEXT:    mulhw r4, r30, r4
+; P8LE-NEXT:    add r8, r11, r8
+; P8LE-NEXT:    add r9, r12, r9
+; P8LE-NEXT:    srwi r11, r8, 31
+; P8LE-NEXT:    add r10, r0, r10
 ; P8LE-NEXT:    srawi r8, r8, 6
-; P8LE-NEXT:    srawi r0, r9, 6
+; P8LE-NEXT:    srawi r12, r9, 6
 ; P8LE-NEXT:    srwi r9, r9, 31
-; P8LE-NEXT:    add r5, r5, r11
-; P8LE-NEXT:    add r8, r8, r12
-; P8LE-NEXT:    srawi r12, r10, 6
+; P8LE-NEXT:    add r8, r8, r11
+; P8LE-NEXT:    add r4, r4, r30
+; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT:    srawi r11, r10, 6
 ; P8LE-NEXT:    srwi r10, r10, 31
-; P8LE-NEXT:    add r9, r0, r9
-; P8LE-NEXT:    mulli r0, r8, 95
-; P8LE-NEXT:    add r10, r12, r10
+; P8LE-NEXT:    add r9, r12, r9
 ; P8LE-NEXT:    mtfprd f0, r8
-; P8LE-NEXT:    srwi r8, r5, 31
-; P8LE-NEXT:    srawi r5, r5, 6
-; P8LE-NEXT:    mulli r11, r9, 95
+; P8LE-NEXT:    mulli r12, r8, 95
+; P8LE-NEXT:    add r10, r11, r10
+; P8LE-NEXT:    srwi r8, r4, 31
 ; P8LE-NEXT:    mtfprd f1, r9
-; P8LE-NEXT:    mulli r9, r10, 95
-; P8LE-NEXT:    add r5, r5, r8
+; P8LE-NEXT:    srawi r4, r4, 6
+; P8LE-NEXT:    mulli r11, r9, 95
 ; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    mtfprd f2, r10
-; P8LE-NEXT:    mtfprd f3, r5
-; P8LE-NEXT:    mulli r5, r5, 95
+; P8LE-NEXT:    mulli r9, r10, 95
+; P8LE-NEXT:    add r4, r4, r8
 ; P8LE-NEXT:    xxswapd v3, vs1
-; P8LE-NEXT:    subf r3, r0, r3
+; P8LE-NEXT:    mtfprd f3, r4
+; P8LE-NEXT:    mulli r4, r4, 95
 ; P8LE-NEXT:    xxswapd v1, vs2
+; P8LE-NEXT:    subf r3, r12, r3
 ; P8LE-NEXT:    mtfprd f0, r3
-; P8LE-NEXT:    subf r4, r11, r4
+; P8LE-NEXT:    subf r6, r11, r6
 ; P8LE-NEXT:    xxswapd v6, vs3
 ; P8LE-NEXT:    subf r3, r9, r7
-; P8LE-NEXT:    mtfprd f1, r4
+; P8LE-NEXT:    mtfprd f1, r6
 ; P8LE-NEXT:    mtfprd f4, r3
-; P8LE-NEXT:    subf r3, r5, r6
+; P8LE-NEXT:    subf r3, r4, r5
 ; P8LE-NEXT:    mtfprd f5, r3
 ; P8LE-NEXT:    xxswapd v4, vs1
 ; P8LE-NEXT:    vmrglh v2, v3, v2
@@ -782,69 +692,61 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: combine_srem_sdiv:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r6, v2
-; P8BE-NEXT:    lis r5, -21386
-; P8BE-NEXT:    ori r5, r5, 37253
-; P8BE-NEXT:    clrldi r3, r6, 48
-; P8BE-NEXT:    rldicl r4, r6, 48, 48
+; P8BE-NEXT:    mfvsrd r5, v2
+; P8BE-NEXT:    lis r4, -21386
+; P8BE-NEXT:    ori r4, r4, 37253
+; P8BE-NEXT:    clrldi r3, r5, 48
+; P8BE-NEXT:    rldicl r6, r5, 48, 48
 ; P8BE-NEXT:    extsh r8, r3
-; P8BE-NEXT:    rldicl r7, r6, 32, 48
-; P8BE-NEXT:    extsh r9, r4
-; P8BE-NEXT:    rldicl r6, r6, 16, 48
-; P8BE-NEXT:    extsw r8, r8
+; P8BE-NEXT:    rldicl r7, r5, 32, 48
+; P8BE-NEXT:    extsh r9, r6
+; P8BE-NEXT:    rldicl r5, r5, 16, 48
+; P8BE-NEXT:    mulhw r11, r8, r4
 ; P8BE-NEXT:    extsh r10, r7
-; P8BE-NEXT:    extsw r9, r9
-; P8BE-NEXT:    extsh r6, r6
-; P8BE-NEXT:    mulld r11, r8, r5
-; P8BE-NEXT:    extsw r10, r10
-; P8BE-NEXT:    extsw r6, r6
-; P8BE-NEXT:    mulld r12, r9, r5
-; P8BE-NEXT:    mulld r0, r10, r5
-; P8BE-NEXT:    mulld r5, r6, r5
-; P8BE-NEXT:    rldicl r11, r11, 32, 32
-; P8BE-NEXT:    rldicl r12, r12, 32, 32
+; P8BE-NEXT:    extsh r5, r5
+; P8BE-NEXT:    mulhw r12, r9, r4
+; P8BE-NEXT:    mulhw r0, r10, r4
+; P8BE-NEXT:    mulhw r4, r5, r4
 ; P8BE-NEXT:    add r8, r11, r8
-; P8BE-NEXT:    rldicl r0, r0, 32, 32
-; P8BE-NEXT:    rldicl r5, r5, 32, 32
 ; P8BE-NEXT:    add r9, r12, r9
 ; P8BE-NEXT:    srawi r11, r8, 6
 ; P8BE-NEXT:    srwi r8, r8, 31
 ; P8BE-NEXT:    add r10, r0, r10
-; P8BE-NEXT:    add r5, r5, r6
+; P8BE-NEXT:    add r4, r4, r5
+; P8BE-NEXT:    add r8, r11, r8
 ; P8BE-NEXT:    srawi r12, r9, 6
 ; P8BE-NEXT:    srwi r9, r9, 31
-; P8BE-NEXT:    add r8, r11, r8
 ; P8BE-NEXT:    srawi r0, r10, 6
-; P8BE-NEXT:    srawi r11, r5, 6
+; P8BE-NEXT:    srawi r11, r4, 6
 ; P8BE-NEXT:    srwi r10, r10, 31
 ; P8BE-NEXT:    add r9, r12, r9
-; P8BE-NEXT:    srwi r5, r5, 31
+; P8BE-NEXT:    srwi r4, r4, 31
 ; P8BE-NEXT:    mulli r12, r8, 95
 ; P8BE-NEXT:    add r10, r0, r10
-; P8BE-NEXT:    add r5, r11, r5
+; P8BE-NEXT:    add r4, r11, r4
 ; P8BE-NEXT:    mulli r0, r9, 95
 ; P8BE-NEXT:    sldi r9, r9, 48
 ; P8BE-NEXT:    sldi r8, r8, 48
 ; P8BE-NEXT:    mtvsrd v3, r9
-; P8BE-NEXT:    mulli r9, r5, 95
+; P8BE-NEXT:    mulli r9, r4, 95
 ; P8BE-NEXT:    mtvsrd v2, r8
 ; P8BE-NEXT:    mulli r8, r10, 95
 ; P8BE-NEXT:    sldi r10, r10, 48
 ; P8BE-NEXT:    subf r3, r12, r3
 ; P8BE-NEXT:    mtvsrd v4, r10
-; P8BE-NEXT:    subf r4, r0, r4
+; P8BE-NEXT:    subf r6, r0, r6
 ; P8BE-NEXT:    sldi r3, r3, 48
 ; P8BE-NEXT:    vmrghh v2, v3, v2
-; P8BE-NEXT:    sldi r4, r4, 48
+; P8BE-NEXT:    sldi r6, r6, 48
 ; P8BE-NEXT:    mtvsrd v3, r3
-; P8BE-NEXT:    subf r3, r9, r6
+; P8BE-NEXT:    subf r3, r9, r5
 ; P8BE-NEXT:    subf r7, r8, r7
-; P8BE-NEXT:    mtvsrd v5, r4
+; P8BE-NEXT:    mtvsrd v5, r6
 ; P8BE-NEXT:    sldi r3, r3, 48
-; P8BE-NEXT:    sldi r6, r7, 48
+; P8BE-NEXT:    sldi r5, r7, 48
 ; P8BE-NEXT:    mtvsrd v1, r3
-; P8BE-NEXT:    sldi r3, r5, 48
-; P8BE-NEXT:    mtvsrd v0, r6
+; P8BE-NEXT:    sldi r3, r4, 48
+; P8BE-NEXT:    mtvsrd v0, r5
 ; P8BE-NEXT:    vmrghh v3, v5, v3
 ; P8BE-NEXT:    mtvsrd v5, r3
 ; P8BE-NEXT:    vmrghh v0, v1, v0
@@ -882,14 +784,11 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -21386
 ; P9LE-NEXT:    ori r5, r5, 37253
 ; P9LE-NEXT:    xxswapd v4, vs0
-; P9LE-NEXT:    vmrglh v3, v4, v3
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 6
@@ -904,6 +803,7 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P9LE-NEXT:    addze r4, r4
 ; P9LE-NEXT:    slwi r4, r4, 3
 ; P9LE-NEXT:    subf r3, r4, r3
+; P9LE-NEXT:    vmrglh v3, v4, v3
 ; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    xxswapd v2, vs0
@@ -935,10 +835,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 37253
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 6
@@ -971,30 +869,28 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P8LE-NEXT:    clrldi r7, r4, 48
 ; P8LE-NEXT:    extsh r6, r5
 ; P8LE-NEXT:    extsh r8, r7
-; P8LE-NEXT:    extsw r6, r6
+; P8LE-NEXT:    mulhw r3, r6, r3
 ; P8LE-NEXT:    rldicl r9, r4, 48, 48
-; P8LE-NEXT:    mulld r3, r6, r3
 ; P8LE-NEXT:    srawi r8, r8, 6
 ; P8LE-NEXT:    extsh r10, r9
 ; P8LE-NEXT:    addze r8, r8
 ; P8LE-NEXT:    rldicl r4, r4, 32, 48
 ; P8LE-NEXT:    srawi r10, r10, 5
 ; P8LE-NEXT:    slwi r8, r8, 6
-; P8LE-NEXT:    subf r7, r8, r7
-; P8LE-NEXT:    rldicl r3, r3, 32, 32
-; P8LE-NEXT:    mtfprd f0, r7
 ; P8LE-NEXT:    add r3, r3, r6
 ; P8LE-NEXT:    addze r6, r10
+; P8LE-NEXT:    subf r7, r8, r7
 ; P8LE-NEXT:    srwi r10, r3, 31
 ; P8LE-NEXT:    srawi r3, r3, 6
+; P8LE-NEXT:    mtfprd f0, r7
 ; P8LE-NEXT:    slwi r6, r6, 5
-; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    add r3, r3, r10
 ; P8LE-NEXT:    extsh r10, r4
 ; P8LE-NEXT:    subf r6, r6, r9
 ; P8LE-NEXT:    mulli r3, r3, 95
 ; P8LE-NEXT:    srawi r8, r10, 3
 ; P8LE-NEXT:    mtfprd f1, r6
+; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    addze r7, r8
 ; P8LE-NEXT:    xxswapd v3, vs1
 ; P8LE-NEXT:    subf r3, r3, r5
@@ -1018,9 +914,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P8BE-NEXT:    rldicl r6, r4, 32, 48
 ; P8BE-NEXT:    extsh r5, r5
 ; P8BE-NEXT:    extsh r6, r6
-; P8BE-NEXT:    extsw r5, r5
+; P8BE-NEXT:    mulhw r3, r5, r3
 ; P8BE-NEXT:    rldicl r7, r4, 16, 48
-; P8BE-NEXT:    mulld r3, r5, r3
 ; P8BE-NEXT:    srawi r8, r6, 5
 ; P8BE-NEXT:    extsh r7, r7
 ; P8BE-NEXT:    addze r8, r8
@@ -1028,16 +923,15 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
 ; P8BE-NEXT:    srawi r9, r7, 6
 ; P8BE-NEXT:    extsh r4, r4
 ; P8BE-NEXT:    slwi r8, r8, 5
+; P8BE-NEXT:    add r3, r3, r5
 ; P8BE-NEXT:    addze r9, r9
 ; P8BE-NEXT:    subf r6, r8, r6
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
-; P8BE-NEXT:    slwi r8, r9, 6
-; P8BE-NEXT:    add r3, r3, r5
-; P8BE-NEXT:    subf r7, r8, r7
 ; P8BE-NEXT:    srwi r10, r3, 31
 ; P8BE-NEXT:    srawi r3, r3, 6
+; P8BE-NEXT:    slwi r8, r9, 6
 ; P8BE-NEXT:    add r3, r3, r10
 ; P8BE-NEXT:    srawi r9, r4, 3
+; P8BE-NEXT:    subf r7, r8, r7
 ; P8BE-NEXT:    mulli r3, r3, 95
 ; P8BE-NEXT:    sldi r6, r6, 48
 ; P8BE-NEXT:    addze r8, r9
@@ -1065,13 +959,10 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -14230
 ; P9LE-NEXT:    ori r5, r5, 30865
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
-; P9LE-NEXT:    xxlxor v4, v4, v4
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 9
@@ -1081,12 +972,11 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 4
+; P9LE-NEXT:    ori r5, r5, 17097
+; P9LE-NEXT:    xxlxor v3, v3, v3
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    ori r5, r5, 17097
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 4
@@ -1094,21 +984,19 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9LE-NEXT:    lis r5, 24749
 ; P9LE-NEXT:    mulli r4, r4, 23
 ; P9LE-NEXT:    subf r3, r4, r3
-; P9LE-NEXT:    xxswapd v3, vs0
+; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
 ; P9LE-NEXT:    ori r5, r5, 47143
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r5, r4, 1, 63
-; P9LE-NEXT:    rldicl r4, r4, 32, 32
+; P9LE-NEXT:    mulhw r4, r4, r5
+; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 11
 ; P9LE-NEXT:    add r4, r4, r5
 ; P9LE-NEXT:    mulli r4, r4, 5423
 ; P9LE-NEXT:    subf r3, r4, r3
-; P9LE-NEXT:    vmrglh v3, v3, v4
+; P9LE-NEXT:    vmrglh v3, v4, v3
 ; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    xxswapd v2, vs0
@@ -1120,12 +1008,10 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    extsh r3, r3
 ; P9BE-NEXT:    lis r4, -19946
 ; P9BE-NEXT:    ori r4, r4, 17097
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    extsh r3, r3
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 4
@@ -1138,11 +1024,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 47143
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r5, r4, 1, 63
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
+; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 11
 ; P9BE-NEXT:    add r4, r4, r5
 ; P9BE-NEXT:    mulli r4, r4, 5423
@@ -1153,10 +1037,8 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 30865
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 9
@@ -1177,46 +1059,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
 ; P8LE-NEXT:    lis r3, 24749
-; P8LE-NEXT:    lis r8, -19946
-; P8LE-NEXT:    lis r10, -14230
+; P8LE-NEXT:    lis r7, -19946
+; P8LE-NEXT:    lis r9, -14230
 ; P8LE-NEXT:    xxlxor v5, v5, v5
 ; P8LE-NEXT:    ori r3, r3, 47143
-; P8LE-NEXT:    ori r8, r8, 17097
+; P8LE-NEXT:    ori r7, r7, 17097
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    rldicl r5, r4, 16, 48
 ; P8LE-NEXT:    rldicl r6, r4, 32, 48
 ; P8LE-NEXT:    rldicl r4, r4, 48, 48
-; P8LE-NEXT:    extsh r7, r5
-; P8LE-NEXT:    extsh r9, r6
-; P8LE-NEXT:    extsw r7, r7
-; P8LE-NEXT:    extsh r11, r4
-; P8LE-NEXT:    extsw r9, r9
-; P8LE-NEXT:    mulld r3, r7, r3
-; P8LE-NEXT:    ori r7, r10, 30865
-; P8LE-NEXT:    extsw r10, r11
-; P8LE-NEXT:    mulld r8, r9, r8
-; P8LE-NEXT:    mulld r7, r10, r7
-; P8LE-NEXT:    rldicl r11, r3, 1, 63
-; P8LE-NEXT:    rldicl r3, r3, 32, 32
-; P8LE-NEXT:    rldicl r8, r8, 32, 32
-; P8LE-NEXT:    rldicl r7, r7, 32, 32
-; P8LE-NEXT:    add r8, r8, r9
-; P8LE-NEXT:    srawi r3, r3, 11
+; P8LE-NEXT:    extsh r8, r5
+; P8LE-NEXT:    extsh r10, r6
+; P8LE-NEXT:    mulhw r3, r8, r3
+; P8LE-NEXT:    ori r8, r9, 30865
+; P8LE-NEXT:    extsh r9, r4
+; P8LE-NEXT:    mulhw r7, r10, r7
+; P8LE-NEXT:    mulhw r8, r9, r8
 ; P8LE-NEXT:    add r7, r7, r10
-; P8LE-NEXT:    srwi r9, r8, 31
-; P8LE-NEXT:    srawi r8, r8, 4
-; P8LE-NEXT:    add r3, r3, r11
+; P8LE-NEXT:    srwi r10, r3, 31
 ; P8LE-NEXT:    add r8, r8, r9
+; P8LE-NEXT:    srawi r3, r3, 11
 ; P8LE-NEXT:    srwi r9, r7, 31
-; P8LE-NEXT:    srawi r7, r7, 9
-; P8LE-NEXT:    mulli r3, r3, 5423
+; P8LE-NEXT:    srawi r7, r7, 4
+; P8LE-NEXT:    add r3, r3, r10
 ; P8LE-NEXT:    add r7, r7, r9
-; P8LE-NEXT:    mulli r8, r8, 23
-; P8LE-NEXT:    mulli r7, r7, 654
+; P8LE-NEXT:    srwi r9, r8, 31
+; P8LE-NEXT:    srawi r8, r8, 9
+; P8LE-NEXT:    mulli r3, r3, 5423
+; P8LE-NEXT:    add r8, r8, r9
+; P8LE-NEXT:    mulli r7, r7, 23
+; P8LE-NEXT:    mulli r8, r8, 654
 ; P8LE-NEXT:    subf r3, r3, r5
 ; P8LE-NEXT:    mtfprd f0, r3
-; P8LE-NEXT:    subf r3, r8, r6
-; P8LE-NEXT:    subf r4, r7, r4
+; P8LE-NEXT:    subf r3, r7, r6
+; P8LE-NEXT:    subf r4, r8, r4
 ; P8LE-NEXT:    mtfprd f1, r3
 ; P8LE-NEXT:    mtfprd f2, r4
 ; P8LE-NEXT:    xxswapd v2, vs0
@@ -1229,54 +1105,48 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: dont_fold_srem_one:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 24749
-; P8BE-NEXT:    lis r7, -19946
+; P8BE-NEXT:    mfvsrd r3, v2
+; P8BE-NEXT:    lis r5, 24749
+; P8BE-NEXT:    lis r6, -19946
 ; P8BE-NEXT:    lis r8, -14230
-; P8BE-NEXT:    ori r3, r3, 47143
-; P8BE-NEXT:    ori r7, r7, 17097
+; P8BE-NEXT:    ori r5, r5, 47143
+; P8BE-NEXT:    ori r6, r6, 17097
 ; P8BE-NEXT:    ori r8, r8, 30865
-; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    rldicl r4, r4, 32, 48
-; P8BE-NEXT:    extsh r5, r5
-; P8BE-NEXT:    extsh r6, r6
+; P8BE-NEXT:    clrldi r4, r3, 48
+; P8BE-NEXT:    rldicl r7, r3, 48, 48
+; P8BE-NEXT:    rldicl r3, r3, 32, 48
 ; P8BE-NEXT:    extsh r4, r4
-; P8BE-NEXT:    extsw r5, r5
-; P8BE-NEXT:    extsw r6, r6
-; P8BE-NEXT:    extsw r4, r4
-; P8BE-NEXT:    mulld r3, r5, r3
-; P8BE-NEXT:    mulld r7, r6, r7
-; P8BE-NEXT:    mulld r8, r4, r8
-; P8BE-NEXT:    rldicl r9, r3, 1, 63
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
-; P8BE-NEXT:    rldicl r7, r7, 32, 32
-; P8BE-NEXT:    rldicl r8, r8, 32, 32
-; P8BE-NEXT:    srawi r3, r3, 11
-; P8BE-NEXT:    add r7, r7, r6
-; P8BE-NEXT:    add r8, r8, r4
-; P8BE-NEXT:    add r3, r3, r9
-; P8BE-NEXT:    srwi r9, r7, 31
-; P8BE-NEXT:    srawi r7, r7, 4
-; P8BE-NEXT:    mulli r3, r3, 5423
-; P8BE-NEXT:    add r7, r7, r9
+; P8BE-NEXT:    extsh r7, r7
+; P8BE-NEXT:    extsh r3, r3
+; P8BE-NEXT:    mulhw r5, r4, r5
+; P8BE-NEXT:    mulhw r6, r7, r6
+; P8BE-NEXT:    mulhw r8, r3, r8
+; P8BE-NEXT:    srwi r9, r5, 31
+; P8BE-NEXT:    srawi r5, r5, 11
+; P8BE-NEXT:    add r6, r6, r7
+; P8BE-NEXT:    add r8, r8, r3
+; P8BE-NEXT:    add r5, r5, r9
+; P8BE-NEXT:    srwi r9, r6, 31
+; P8BE-NEXT:    srawi r6, r6, 4
+; P8BE-NEXT:    add r6, r6, r9
 ; P8BE-NEXT:    srwi r9, r8, 31
 ; P8BE-NEXT:    srawi r8, r8, 9
-; P8BE-NEXT:    mulli r7, r7, 23
+; P8BE-NEXT:    mulli r5, r5, 5423
 ; P8BE-NEXT:    add r8, r8, r9
+; P8BE-NEXT:    mulli r6, r6, 23
 ; P8BE-NEXT:    li r9, 0
 ; P8BE-NEXT:    mulli r8, r8, 654
-; P8BE-NEXT:    subf r3, r3, r5
+; P8BE-NEXT:    subf r4, r5, r4
 ; P8BE-NEXT:    sldi r5, r9, 48
-; P8BE-NEXT:    sldi r3, r3, 48
 ; P8BE-NEXT:    mtvsrd v2, r5
-; P8BE-NEXT:    subf r5, r7, r6
-; P8BE-NEXT:    mtvsrd v3, r3
-; P8BE-NEXT:    sldi r3, r5, 48
-; P8BE-NEXT:    subf r4, r8, r4
-; P8BE-NEXT:    mtvsrd v4, r3
+; P8BE-NEXT:    subf r5, r6, r7
 ; P8BE-NEXT:    sldi r4, r4, 48
-; P8BE-NEXT:    mtvsrd v5, r4
+; P8BE-NEXT:    subf r3, r8, r3
+; P8BE-NEXT:    mtvsrd v3, r4
+; P8BE-NEXT:    sldi r4, r5, 48
+; P8BE-NEXT:    sldi r3, r3, 48
+; P8BE-NEXT:    mtvsrd v4, r4
+; P8BE-NEXT:    mtvsrd v5, r3
 ; P8BE-NEXT:    vmrghh v3, v4, v3
 ; P8BE-NEXT:    vmrghh v2, v2, v5
 ; P8BE-NEXT:    vmrghw v2, v2, v3
@@ -1291,12 +1161,10 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    extsh r4, r3
 ; P9LE-NEXT:    lis r5, -19946
 ; P9LE-NEXT:    ori r5, r5, 17097
-; P9LE-NEXT:    extsw r4, r4
-; P9LE-NEXT:    mulld r5, r4, r5
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    extsh r4, r3
+; P9LE-NEXT:    mulhw r5, r4, r5
 ; P9LE-NEXT:    add r4, r5, r4
 ; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 4
@@ -1308,11 +1176,9 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    extsh r4, r3
-; P9LE-NEXT:    extsw r4, r4
 ; P9LE-NEXT:    ori r5, r5, 47143
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r5, r4, 1, 63
-; P9LE-NEXT:    rldicl r4, r4, 32, 32
+; P9LE-NEXT:    mulhw r4, r4, r5
+; P9LE-NEXT:    srwi r5, r4, 31
 ; P9LE-NEXT:    srawi r4, r4, 11
 ; P9LE-NEXT:    add r4, r4, r5
 ; P9LE-NEXT:    mulli r4, r4, 5423
@@ -1339,12 +1205,10 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    extsh r3, r3
 ; P9BE-NEXT:    lis r4, -19946
 ; P9BE-NEXT:    ori r4, r4, 17097
-; P9BE-NEXT:    extsw r3, r3
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    extsh r3, r3
+; P9BE-NEXT:    mulhw r4, r3, r4
 ; P9BE-NEXT:    add r4, r4, r3
 ; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 4
@@ -1357,11 +1221,9 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    extsh r3, r3
-; P9BE-NEXT:    extsw r3, r3
 ; P9BE-NEXT:    ori r4, r4, 47143
-; P9BE-NEXT:    mulld r4, r3, r4
-; P9BE-NEXT:    rldicl r5, r4, 1, 63
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhw r4, r3, r4
+; P9BE-NEXT:    srwi r5, r4, 31
 ; P9BE-NEXT:    srawi r4, r4, 11
 ; P9BE-NEXT:    add r4, r4, r5
 ; P9BE-NEXT:    mulli r4, r4, 5423
@@ -1388,39 +1250,35 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ; P8LE-LABEL: dont_fold_urem_i16_smax:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r6, 24749
-; P8LE-NEXT:    lis r7, -19946
+; P8LE-NEXT:    lis r4, 24749
+; P8LE-NEXT:    lis r5, -19946
 ; P8LE-NEXT:    xxlxor v5, v5, v5
-; P8LE-NEXT:    ori r6, r6, 47143
-; P8LE-NEXT:    ori r7, r7, 17097
+; P8LE-NEXT:    ori r4, r4, 47143
+; P8LE-NEXT:    ori r5, r5, 17097
 ; P8LE-NEXT:    mffprd r3, f0
-; P8LE-NEXT:    rldicl r4, r3, 16, 48
-; P8LE-NEXT:    rldicl r5, r3, 32, 48
-; P8LE-NEXT:    extsh r8, r4
-; P8LE-NEXT:    extsh r9, r5
-; P8LE-NEXT:    extsw r8, r8
-; P8LE-NEXT:    extsw r9, r9
-; P8LE-NEXT:    mulld r6, r8, r6
-; P8LE-NEXT:    mulld r7, r9, r7
+; P8LE-NEXT:    rldicl r6, r3, 16, 48
+; P8LE-NEXT:    rldicl r7, r3, 32, 48
+; P8LE-NEXT:    extsh r8, r6
+; P8LE-NEXT:    extsh r9, r7
+; P8LE-NEXT:    mulhw r4, r8, r4
+; P8LE-NEXT:    mulhw r5, r9, r5
 ; P8LE-NEXT:    rldicl r3, r3, 48, 48
-; P8LE-NEXT:    rldicl r8, r6, 32, 32
-; P8LE-NEXT:    rldicl r7, r7, 32, 32
-; P8LE-NEXT:    rldicl r6, r6, 1, 63
-; P8LE-NEXT:    srawi r8, r8, 11
-; P8LE-NEXT:    add r7, r7, r9
-; P8LE-NEXT:    add r6, r8, r6
-; P8LE-NEXT:    srwi r8, r7, 31
-; P8LE-NEXT:    srawi r7, r7, 4
-; P8LE-NEXT:    mulli r6, r6, 5423
-; P8LE-NEXT:    add r7, r7, r8
+; P8LE-NEXT:    srwi r8, r4, 31
+; P8LE-NEXT:    srawi r4, r4, 11
+; P8LE-NEXT:    add r5, r5, r9
+; P8LE-NEXT:    add r4, r4, r8
+; P8LE-NEXT:    srwi r8, r5, 31
+; P8LE-NEXT:    srawi r5, r5, 4
+; P8LE-NEXT:    mulli r4, r4, 5423
+; P8LE-NEXT:    add r5, r5, r8
 ; P8LE-NEXT:    extsh r8, r3
-; P8LE-NEXT:    mulli r7, r7, 23
+; P8LE-NEXT:    mulli r5, r5, 23
 ; P8LE-NEXT:    srawi r8, r8, 15
-; P8LE-NEXT:    subf r4, r6, r4
+; P8LE-NEXT:    subf r4, r4, r6
 ; P8LE-NEXT:    addze r6, r8
 ; P8LE-NEXT:    mtfprd f0, r4
 ; P8LE-NEXT:    slwi r4, r6, 15
-; P8LE-NEXT:    subf r5, r7, r5
+; P8LE-NEXT:    subf r5, r5, r7
 ; P8LE-NEXT:    subf r3, r4, r3
 ; P8LE-NEXT:    mtfprd f1, r5
 ; P8LE-NEXT:    xxswapd v2, vs0
@@ -1434,47 +1292,43 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: dont_fold_urem_i16_smax:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    lis r3, 24749
-; P8BE-NEXT:    lis r7, -19946
-; P8BE-NEXT:    ori r3, r3, 47143
-; P8BE-NEXT:    ori r7, r7, 17097
-; P8BE-NEXT:    clrldi r5, r4, 48
-; P8BE-NEXT:    rldicl r6, r4, 48, 48
-; P8BE-NEXT:    extsh r5, r5
+; P8BE-NEXT:    mfvsrd r3, v2
+; P8BE-NEXT:    lis r4, 24749
+; P8BE-NEXT:    lis r5, -19946
+; P8BE-NEXT:    ori r4, r4, 47143
+; P8BE-NEXT:    ori r5, r5, 17097
+; P8BE-NEXT:    clrldi r6, r3, 48
+; P8BE-NEXT:    rldicl r7, r3, 48, 48
 ; P8BE-NEXT:    extsh r6, r6
-; P8BE-NEXT:    extsw r5, r5
-; P8BE-NEXT:    extsw r6, r6
-; P8BE-NEXT:    mulld r3, r5, r3
-; P8BE-NEXT:    mulld r7, r6, r7
-; P8BE-NEXT:    rldicl r4, r4, 32, 48
-; P8BE-NEXT:    extsh r4, r4
-; P8BE-NEXT:    rldicl r8, r3, 1, 63
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
-; P8BE-NEXT:    rldicl r7, r7, 32, 32
-; P8BE-NEXT:    srawi r3, r3, 11
-; P8BE-NEXT:    add r7, r7, r6
-; P8BE-NEXT:    add r3, r3, r8
-; P8BE-NEXT:    srwi r8, r7, 31
-; P8BE-NEXT:    srawi r7, r7, 4
-; P8BE-NEXT:    mulli r3, r3, 5423
-; P8BE-NEXT:    add r7, r7, r8
+; P8BE-NEXT:    extsh r7, r7
+; P8BE-NEXT:    mulhw r4, r6, r4
+; P8BE-NEXT:    mulhw r5, r7, r5
+; P8BE-NEXT:    rldicl r3, r3, 32, 48
+; P8BE-NEXT:    extsh r3, r3
+; P8BE-NEXT:    srwi r8, r4, 31
+; P8BE-NEXT:    srawi r4, r4, 11
+; P8BE-NEXT:    add r5, r5, r7
+; P8BE-NEXT:    add r4, r4, r8
+; P8BE-NEXT:    srwi r8, r5, 31
+; P8BE-NEXT:    srawi r5, r5, 4
+; P8BE-NEXT:    mulli r4, r4, 5423
+; P8BE-NEXT:    add r5, r5, r8
 ; P8BE-NEXT:    li r8, 0
-; P8BE-NEXT:    mulli r7, r7, 23
-; P8BE-NEXT:    srawi r9, r4, 15
-; P8BE-NEXT:    subf r3, r3, r5
-; P8BE-NEXT:    sldi r5, r8, 48
+; P8BE-NEXT:    mulli r5, r5, 23
+; P8BE-NEXT:    srawi r9, r3, 15
+; P8BE-NEXT:    subf r4, r4, r6
+; P8BE-NEXT:    sldi r6, r8, 48
 ; P8BE-NEXT:    addze r8, r9
-; P8BE-NEXT:    mtvsrd v2, r5
-; P8BE-NEXT:    subf r5, r7, r6
+; P8BE-NEXT:    mtvsrd v2, r6
 ; P8BE-NEXT:    slwi r6, r8, 15
-; P8BE-NEXT:    sldi r3, r3, 48
-; P8BE-NEXT:    subf r4, r6, r4
-; P8BE-NEXT:    mtvsrd v3, r3
-; P8BE-NEXT:    sldi r3, r5, 48
 ; P8BE-NEXT:    sldi r4, r4, 48
-; P8BE-NEXT:    mtvsrd v4, r3
-; P8BE-NEXT:    mtvsrd v5, r4
+; P8BE-NEXT:    subf r5, r5, r7
+; P8BE-NEXT:    subf r3, r6, r3
+; P8BE-NEXT:    mtvsrd v3, r4
+; P8BE-NEXT:    sldi r4, r5, 48
+; P8BE-NEXT:    sldi r3, r3, 48
+; P8BE-NEXT:    mtvsrd v4, r4
+; P8BE-NEXT:    mtvsrd v5, r3
 ; P8BE-NEXT:    vmrghh v3, v4, v3
 ; P8BE-NEXT:    vmrghh v2, v2, v5
 ; P8BE-NEXT:    vmrghw v2, v2, v3

diff  --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index ba568c5d153b..ce8f179ff837 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -15,21 +15,21 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    lis r5, 21399
 ; P9LE-NEXT:    ori r5, r5, 33437
-; P9LE-NEXT:    rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT:    mulld r4, r4, r5
+; P9LE-NEXT:    clrlwi r4, r3, 16
+; P9LE-NEXT:    mulhwu r4, r4, r5
 ; P9LE-NEXT:    lis r5, 16727
 ; P9LE-NEXT:    ori r5, r5, 2287
-; P9LE-NEXT:    rldicl r4, r4, 27, 37
+; P9LE-NEXT:    srwi r4, r4, 5
 ; P9LE-NEXT:    mulli r4, r4, 98
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT:    mulld r4, r4, r5
+; P9LE-NEXT:    clrlwi r4, r3, 16
+; P9LE-NEXT:    mulhwu r4, r4, r5
 ; P9LE-NEXT:    lis r5, 8456
 ; P9LE-NEXT:    ori r5, r5, 16913
-; P9LE-NEXT:    rldicl r4, r4, 24, 40
+; P9LE-NEXT:    srwi r4, r4, 8
 ; P9LE-NEXT:    mulli r4, r4, 1003
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    xxswapd v3, vs0
@@ -37,8 +37,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    rlwinm r4, r3, 30, 18, 31
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r4, r4, 30, 34
+; P9LE-NEXT:    mulhwu r4, r4, r5
+; P9LE-NEXT:    lis r5, 22765
+; P9LE-NEXT:    ori r5, r5, 8969
+; P9LE-NEXT:    srwi r4, r4, 2
 ; P9LE-NEXT:    mulli r4, r4, 124
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    xxswapd v4, vs0
@@ -46,19 +48,15 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    lis r6, 22765
-; P9LE-NEXT:    ori r6, r6, 8969
-; P9LE-NEXT:    vmrglh v3, v4, v3
-; P9LE-NEXT:    xxswapd v4, vs0
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhwu r5, r4, r5
 ; P9LE-NEXT:    subf r4, r5, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
 ; P9LE-NEXT:    add r4, r4, r5
 ; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    subf r3, r4, r3
+; P9LE-NEXT:    vmrglh v3, v4, v3
+; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    xxswapd v2, vs0
 ; P9LE-NEXT:    vmrglh v2, v4, v2
@@ -69,49 +67,45 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r4, 16727
+; P9BE-NEXT:    ori r4, r4, 2287
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    lis r5, 16727
-; P9BE-NEXT:    ori r5, r5, 2287
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    lis r5, 21399
-; P9BE-NEXT:    ori r5, r5, 33437
-; P9BE-NEXT:    rldicl r4, r4, 24, 40
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    srwi r4, r4, 8
 ; P9BE-NEXT:    mulli r4, r4, 1003
 ; P9BE-NEXT:    subf r3, r4, r3
+; P9BE-NEXT:    lis r4, 21399
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v3, r3
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    lis r5, 8456
-; P9BE-NEXT:    ori r5, r5, 16913
-; P9BE-NEXT:    rldicl r4, r4, 27, 37
+; P9BE-NEXT:    ori r4, r4, 33437
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    srwi r4, r4, 5
 ; P9BE-NEXT:    mulli r4, r4, 98
 ; P9BE-NEXT:    subf r3, r4, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r5, 8456
+; P9BE-NEXT:    ori r5, r5, 16913
+; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    clrlwi r4, r3, 16
 ; P9BE-NEXT:    rlwinm r3, r3, 30, 18, 31
-; P9BE-NEXT:    mulld r3, r3, r5
-; P9BE-NEXT:    lis r5, 22765
-; P9BE-NEXT:    ori r5, r5, 8969
-; P9BE-NEXT:    rldicl r3, r3, 30, 34
+; P9BE-NEXT:    mulhwu r3, r3, r5
+; P9BE-NEXT:    srwi r3, r3, 2
 ; P9BE-NEXT:    mulli r3, r3, 124
 ; P9BE-NEXT:    subf r3, r3, r4
+; P9BE-NEXT:    lis r4, 22765
 ; P9BE-NEXT:    sldi r3, r3, 48
-; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    ori r4, r4, 8969
+; P9BE-NEXT:    mulhwu r4, r3, r4
 ; P9BE-NEXT:    subf r5, r4, r3
 ; P9BE-NEXT:    srwi r5, r5, 1
 ; P9BE-NEXT:    add r4, r5, r4
@@ -128,45 +122,43 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
 ; P8LE-NEXT:    lis r3, 22765
-; P8LE-NEXT:    lis r8, 21399
+; P8LE-NEXT:    lis r7, 21399
+; P8LE-NEXT:    lis r10, 16727
 ; P8LE-NEXT:    ori r3, r3, 8969
-; P8LE-NEXT:    ori r8, r8, 33437
+; P8LE-NEXT:    ori r7, r7, 33437
+; P8LE-NEXT:    ori r10, r10, 2287
 ; P8LE-NEXT:    mffprd r4, f0
-; P8LE-NEXT:    clrldi r5, r4, 48
-; P8LE-NEXT:    rldicl r9, r4, 32, 48
-; P8LE-NEXT:    clrlwi r6, r5, 16
-; P8LE-NEXT:    rldicl r10, r4, 16, 48
-; P8LE-NEXT:    rlwinm r11, r9, 0, 16, 31
-; P8LE-NEXT:    clrldi r7, r6, 32
-; P8LE-NEXT:    rlwinm r12, r10, 0, 16, 31
-; P8LE-NEXT:    mulld r3, r7, r3
-; P8LE-NEXT:    lis r7, 16727
-; P8LE-NEXT:    ori r7, r7, 2287
-; P8LE-NEXT:    mulld r8, r11, r8
+; P8LE-NEXT:    clrldi r6, r4, 48
+; P8LE-NEXT:    rldicl r5, r4, 32, 48
+; P8LE-NEXT:    clrlwi r9, r6, 16
+; P8LE-NEXT:    rldicl r8, r4, 16, 48
+; P8LE-NEXT:    clrlwi r11, r5, 16
+; P8LE-NEXT:    mulhwu r3, r9, r3
+; P8LE-NEXT:    clrlwi r12, r8, 16
+; P8LE-NEXT:    mulhwu r7, r11, r7
 ; P8LE-NEXT:    lis r11, 8456
 ; P8LE-NEXT:    rldicl r4, r4, 48, 48
-; P8LE-NEXT:    mulld r7, r12, r7
+; P8LE-NEXT:    mulhwu r10, r12, r10
 ; P8LE-NEXT:    ori r11, r11, 16913
 ; P8LE-NEXT:    rlwinm r12, r4, 30, 18, 31
-; P8LE-NEXT:    rldicl r3, r3, 32, 32
-; P8LE-NEXT:    mulld r11, r12, r11
-; P8LE-NEXT:    subf r6, r3, r6
-; P8LE-NEXT:    rldicl r8, r8, 27, 37
-; P8LE-NEXT:    srwi r6, r6, 1
-; P8LE-NEXT:    add r3, r6, r3
-; P8LE-NEXT:    rldicl r6, r7, 24, 40
-; P8LE-NEXT:    mulli r7, r8, 98
+; P8LE-NEXT:    mulhwu r11, r12, r11
+; P8LE-NEXT:    subf r9, r3, r9
+; P8LE-NEXT:    srwi r9, r9, 1
+; P8LE-NEXT:    srwi r7, r7, 5
+; P8LE-NEXT:    add r3, r9, r3
+; P8LE-NEXT:    srwi r9, r10, 8
 ; P8LE-NEXT:    srwi r3, r3, 6
-; P8LE-NEXT:    rldicl r8, r11, 30, 34
-; P8LE-NEXT:    mulli r6, r6, 1003
+; P8LE-NEXT:    mulli r7, r7, 98
+; P8LE-NEXT:    srwi r10, r11, 2
+; P8LE-NEXT:    mulli r9, r9, 1003
 ; P8LE-NEXT:    mulli r3, r3, 95
-; P8LE-NEXT:    mulli r8, r8, 124
-; P8LE-NEXT:    subf r7, r7, r9
-; P8LE-NEXT:    subf r6, r6, r10
-; P8LE-NEXT:    mtfprd f0, r7
-; P8LE-NEXT:    subf r3, r3, r5
-; P8LE-NEXT:    subf r4, r8, r4
-; P8LE-NEXT:    mtfprd f1, r6
+; P8LE-NEXT:    mulli r10, r10, 124
+; P8LE-NEXT:    subf r5, r7, r5
+; P8LE-NEXT:    subf r7, r9, r8
+; P8LE-NEXT:    mtfprd f0, r5
+; P8LE-NEXT:    subf r3, r3, r6
+; P8LE-NEXT:    subf r4, r10, r4
+; P8LE-NEXT:    mtfprd f1, r7
 ; P8LE-NEXT:    mtfprd f2, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    mtfprd f3, r4
@@ -182,47 +174,43 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
 ; P8BE-NEXT:    lis r3, 22765
-; P8BE-NEXT:    lis r9, 16727
+; P8BE-NEXT:    lis r7, 16727
+; P8BE-NEXT:    lis r9, 21399
+; P8BE-NEXT:    lis r10, 8456
 ; P8BE-NEXT:    ori r3, r3, 8969
-; P8BE-NEXT:    ori r9, r9, 2287
-; P8BE-NEXT:    rldicl r5, r4, 16, 48
-; P8BE-NEXT:    clrldi r6, r4, 48
-; P8BE-NEXT:    clrlwi r5, r5, 16
-; P8BE-NEXT:    rldicl r7, r4, 48, 48
+; P8BE-NEXT:    ori r7, r7, 2287
+; P8BE-NEXT:    ori r9, r9, 33437
+; P8BE-NEXT:    ori r10, r10, 16913
+; P8BE-NEXT:    rldicl r6, r4, 16, 48
+; P8BE-NEXT:    clrldi r5, r4, 48
 ; P8BE-NEXT:    clrlwi r6, r6, 16
-; P8BE-NEXT:    clrldi r8, r5, 32
-; P8BE-NEXT:    clrlwi r7, r7, 16
-; P8BE-NEXT:    mulld r3, r8, r3
-; P8BE-NEXT:    lis r8, 21399
-; P8BE-NEXT:    clrldi r10, r6, 32
-; P8BE-NEXT:    ori r8, r8, 33437
-; P8BE-NEXT:    clrldi r11, r7, 32
-; P8BE-NEXT:    mulld r9, r10, r9
-; P8BE-NEXT:    lis r10, 8456
+; P8BE-NEXT:    rldicl r8, r4, 48, 48
+; P8BE-NEXT:    clrlwi r5, r5, 16
+; P8BE-NEXT:    mulhwu r3, r6, r3
 ; P8BE-NEXT:    rldicl r4, r4, 32, 48
-; P8BE-NEXT:    mulld r8, r11, r8
-; P8BE-NEXT:    ori r10, r10, 16913
+; P8BE-NEXT:    clrlwi r8, r8, 16
+; P8BE-NEXT:    mulhwu r7, r5, r7
 ; P8BE-NEXT:    rlwinm r11, r4, 30, 18, 31
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
 ; P8BE-NEXT:    clrlwi r4, r4, 16
-; P8BE-NEXT:    mulld r10, r11, r10
-; P8BE-NEXT:    subf r11, r3, r5
+; P8BE-NEXT:    mulhwu r9, r8, r9
+; P8BE-NEXT:    mulhwu r10, r11, r10
+; P8BE-NEXT:    subf r11, r3, r6
 ; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    rldicl r9, r9, 24, 40
+; P8BE-NEXT:    srwi r7, r7, 8
 ; P8BE-NEXT:    add r3, r11, r3
-; P8BE-NEXT:    rldicl r8, r8, 27, 37
+; P8BE-NEXT:    srwi r9, r9, 5
+; P8BE-NEXT:    srwi r10, r10, 2
+; P8BE-NEXT:    mulli r7, r7, 1003
 ; P8BE-NEXT:    srwi r3, r3, 6
-; P8BE-NEXT:    mulli r9, r9, 1003
-; P8BE-NEXT:    rldicl r10, r10, 30, 34
-; P8BE-NEXT:    mulli r8, r8, 98
+; P8BE-NEXT:    mulli r9, r9, 98
 ; P8BE-NEXT:    mulli r3, r3, 95
 ; P8BE-NEXT:    mulli r10, r10, 124
-; P8BE-NEXT:    subf r6, r9, r6
-; P8BE-NEXT:    subf r7, r8, r7
-; P8BE-NEXT:    sldi r6, r6, 48
-; P8BE-NEXT:    subf r3, r3, r5
+; P8BE-NEXT:    subf r5, r7, r5
+; P8BE-NEXT:    subf r7, r9, r8
+; P8BE-NEXT:    sldi r5, r5, 48
+; P8BE-NEXT:    subf r3, r3, r6
 ; P8BE-NEXT:    subf r4, r10, r4
-; P8BE-NEXT:    mtvsrd v2, r6
+; P8BE-NEXT:    mtvsrd v2, r5
 ; P8BE-NEXT:    sldi r5, r7, 48
 ; P8BE-NEXT:    sldi r3, r3, 48
 ; P8BE-NEXT:    sldi r4, r4, 48
@@ -242,15 +230,13 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
+; P9LE-NEXT:    lis r5, 22765
+; P9LE-NEXT:    ori r5, r5, 8969
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    lis r6, 22765
-; P9LE-NEXT:    ori r6, r6, 8969
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
-; P9LE-NEXT:    subf r4, r5, r4
+; P9LE-NEXT:    mulhwu r6, r4, r5
+; P9LE-NEXT:    subf r4, r6, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
-; P9LE-NEXT:    add r4, r4, r5
+; P9LE-NEXT:    add r4, r4, r6
 ; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    subf r3, r4, r3
@@ -258,12 +244,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
-; P9LE-NEXT:    subf r4, r5, r4
+; P9LE-NEXT:    mulhwu r6, r4, r5
+; P9LE-NEXT:    subf r4, r6, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
-; P9LE-NEXT:    add r4, r4, r5
+; P9LE-NEXT:    add r4, r4, r6
 ; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    subf r3, r4, r3
@@ -272,12 +256,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
-; P9LE-NEXT:    subf r4, r5, r4
+; P9LE-NEXT:    mulhwu r6, r4, r5
+; P9LE-NEXT:    subf r4, r6, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
-; P9LE-NEXT:    add r4, r4, r5
+; P9LE-NEXT:    add r4, r4, r6
 ; P9LE-NEXT:    srwi r4, r4, 6
 ; P9LE-NEXT:    mulli r4, r4, 95
 ; P9LE-NEXT:    subf r3, r4, r3
@@ -286,9 +268,7 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    mulhwu r5, r4, r5
 ; P9LE-NEXT:    subf r4, r5, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
 ; P9LE-NEXT:    add r4, r4, r5
@@ -307,55 +287,47 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r4, 22765
+; P9BE-NEXT:    ori r4, r4, 8969
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    lis r5, 22765
-; P9BE-NEXT:    ori r5, r5, 8969
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
-; P9BE-NEXT:    subf r6, r4, r3
+; P9BE-NEXT:    mulhwu r5, r3, r4
+; P9BE-NEXT:    subf r6, r5, r3
 ; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r4, r6, r4
-; P9BE-NEXT:    srwi r4, r4, 6
-; P9BE-NEXT:    mulli r4, r4, 95
-; P9BE-NEXT:    subf r3, r4, r3
+; P9BE-NEXT:    add r5, r6, r5
+; P9BE-NEXT:    srwi r5, r5, 6
+; P9BE-NEXT:    mulli r5, r5, 95
+; P9BE-NEXT:    subf r3, r5, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v3, r3
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
-; P9BE-NEXT:    subf r6, r4, r3
+; P9BE-NEXT:    mulhwu r5, r3, r4
+; P9BE-NEXT:    subf r6, r5, r3
 ; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r4, r6, r4
-; P9BE-NEXT:    srwi r4, r4, 6
-; P9BE-NEXT:    mulli r4, r4, 95
-; P9BE-NEXT:    subf r3, r4, r3
+; P9BE-NEXT:    add r5, r6, r5
+; P9BE-NEXT:    srwi r5, r5, 6
+; P9BE-NEXT:    mulli r5, r5, 95
+; P9BE-NEXT:    subf r3, r5, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
-; P9BE-NEXT:    subf r6, r4, r3
+; P9BE-NEXT:    mulhwu r5, r3, r4
+; P9BE-NEXT:    subf r6, r5, r3
 ; P9BE-NEXT:    srwi r6, r6, 1
-; P9BE-NEXT:    add r4, r6, r4
-; P9BE-NEXT:    srwi r4, r4, 6
-; P9BE-NEXT:    mulli r4, r4, 95
-; P9BE-NEXT:    subf r3, r4, r3
+; P9BE-NEXT:    add r5, r6, r5
+; P9BE-NEXT:    srwi r5, r5, 6
+; P9BE-NEXT:    mulli r5, r5, 95
+; P9BE-NEXT:    subf r3, r5, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    mulhwu r4, r3, r4
 ; P9BE-NEXT:    subf r5, r4, r3
 ; P9BE-NEXT:    srwi r5, r5, 1
 ; P9BE-NEXT:    add r4, r5, r4
@@ -371,62 +343,52 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P8LE-LABEL: fold_urem_vec_2:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r4, 22765
-; P8LE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
+; P8LE-NEXT:    lis r3, 22765
 ; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r4, r4, 8969
-; P8LE-NEXT:    mffprd r5, f0
-; P8LE-NEXT:    clrldi r3, r5, 48
-; P8LE-NEXT:    rldicl r6, r5, 48, 48
-; P8LE-NEXT:    clrlwi r8, r3, 16
-; P8LE-NEXT:    rldicl r7, r5, 32, 48
+; P8LE-NEXT:    ori r3, r3, 8969
+; P8LE-NEXT:    mffprd r4, f0
+; P8LE-NEXT:    clrldi r5, r4, 48
+; P8LE-NEXT:    rldicl r6, r4, 48, 48
+; P8LE-NEXT:    clrlwi r8, r5, 16
+; P8LE-NEXT:    rldicl r7, r4, 32, 48
 ; P8LE-NEXT:    clrlwi r9, r6, 16
-; P8LE-NEXT:    rldicl r5, r5, 16, 48
-; P8LE-NEXT:    clrldi r11, r8, 32
-; P8LE-NEXT:    clrlwi r10, r7, 16
-; P8LE-NEXT:    clrlwi r12, r5, 16
-; P8LE-NEXT:    mulld r11, r11, r4
-; P8LE-NEXT:    clrldi r0, r9, 32
-; P8LE-NEXT:    clrldi r30, r10, 32
-; P8LE-NEXT:    clrldi r29, r12, 32
-; P8LE-NEXT:    mulld r0, r0, r4
-; P8LE-NEXT:    mulld r30, r30, r4
-; P8LE-NEXT:    mulld r4, r29, r4
-; P8LE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; P8LE-NEXT:    rldicl r11, r11, 32, 32
-; P8LE-NEXT:    subf r8, r11, r8
-; P8LE-NEXT:    rldicl r0, r0, 32, 32
+; P8LE-NEXT:    rldicl r4, r4, 16, 48
+; P8LE-NEXT:    mulhwu r10, r8, r3
+; P8LE-NEXT:    clrlwi r11, r7, 16
+; P8LE-NEXT:    clrlwi r0, r4, 16
+; P8LE-NEXT:    mulhwu r12, r9, r3
+; P8LE-NEXT:    mulhwu r30, r11, r3
+; P8LE-NEXT:    mulhwu r3, r0, r3
+; P8LE-NEXT:    subf r8, r10, r8
 ; P8LE-NEXT:    srwi r8, r8, 1
-; P8LE-NEXT:    rldicl r30, r30, 32, 32
-; P8LE-NEXT:    rldicl r4, r4, 32, 32
-; P8LE-NEXT:    subf r9, r0, r9
-; P8LE-NEXT:    add r8, r8, r11
-; P8LE-NEXT:    subf r10, r30, r10
-; P8LE-NEXT:    subf r11, r4, r12
+; P8LE-NEXT:    subf r9, r12, r9
+; P8LE-NEXT:    add r8, r8, r10
+; P8LE-NEXT:    subf r10, r30, r11
+; P8LE-NEXT:    subf r11, r3, r0
 ; P8LE-NEXT:    srwi r9, r9, 1
-; P8LE-NEXT:    srwi r8, r8, 6
 ; P8LE-NEXT:    srwi r10, r10, 1
 ; P8LE-NEXT:    srwi r11, r11, 1
-; P8LE-NEXT:    add r9, r9, r0
+; P8LE-NEXT:    add r9, r9, r12
+; P8LE-NEXT:    srwi r8, r8, 6
 ; P8LE-NEXT:    add r10, r10, r30
-; P8LE-NEXT:    add r4, r11, r4
+; P8LE-NEXT:    add r3, r11, r3
 ; P8LE-NEXT:    srwi r9, r9, 6
 ; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
 ; P8LE-NEXT:    mulli r8, r8, 95
 ; P8LE-NEXT:    srwi r10, r10, 6
-; P8LE-NEXT:    srwi r4, r4, 6
+; P8LE-NEXT:    srwi r3, r3, 6
 ; P8LE-NEXT:    mulli r9, r9, 95
 ; P8LE-NEXT:    mulli r10, r10, 95
-; P8LE-NEXT:    mulli r4, r4, 95
-; P8LE-NEXT:    subf r3, r8, r3
+; P8LE-NEXT:    mulli r3, r3, 95
+; P8LE-NEXT:    subf r5, r8, r5
 ; P8LE-NEXT:    subf r6, r9, r6
-; P8LE-NEXT:    mtfprd f0, r3
-; P8LE-NEXT:    subf r3, r10, r7
-; P8LE-NEXT:    subf r4, r4, r5
+; P8LE-NEXT:    mtfprd f0, r5
+; P8LE-NEXT:    subf r5, r10, r7
+; P8LE-NEXT:    subf r3, r3, r4
 ; P8LE-NEXT:    mtfprd f1, r6
-; P8LE-NEXT:    mtfprd f2, r3
+; P8LE-NEXT:    mtfprd f2, r5
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    mtfprd f3, r4
+; P8LE-NEXT:    mtfprd f3, r3
 ; P8LE-NEXT:    xxswapd v3, vs1
 ; P8LE-NEXT:    xxswapd v4, vs2
 ; P8LE-NEXT:    xxswapd v5, vs3
@@ -445,24 +407,16 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
 ; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    rldicl r7, r4, 32, 48
 ; P8BE-NEXT:    clrlwi r6, r6, 16
-; P8BE-NEXT:    clrldi r8, r5, 32
+; P8BE-NEXT:    mulhwu r8, r5, r3
 ; P8BE-NEXT:    rldicl r4, r4, 16, 48
 ; P8BE-NEXT:    clrlwi r7, r7, 16
-; P8BE-NEXT:    clrldi r9, r6, 32
-; P8BE-NEXT:    mulld r8, r8, r3
+; P8BE-NEXT:    mulhwu r9, r6, r3
 ; P8BE-NEXT:    clrlwi r4, r4, 16
-; P8BE-NEXT:    clrldi r10, r7, 32
-; P8BE-NEXT:    mulld r9, r9, r3
-; P8BE-NEXT:    clrldi r11, r4, 32
-; P8BE-NEXT:    mulld r10, r10, r3
-; P8BE-NEXT:    mulld r3, r11, r3
-; P8BE-NEXT:    rldicl r8, r8, 32, 32
-; P8BE-NEXT:    rldicl r9, r9, 32, 32
+; P8BE-NEXT:    mulhwu r10, r7, r3
+; P8BE-NEXT:    mulhwu r3, r4, r3
 ; P8BE-NEXT:    subf r11, r8, r5
-; P8BE-NEXT:    rldicl r10, r10, 32, 32
 ; P8BE-NEXT:    subf r12, r9, r6
 ; P8BE-NEXT:    srwi r11, r11, 1
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
 ; P8BE-NEXT:    add r8, r11, r8
 ; P8BE-NEXT:    subf r11, r10, r7
 ; P8BE-NEXT:    srwi r12, r12, 1
@@ -507,39 +461,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
+; P9LE-NEXT:    lis r5, 22765
+; P9LE-NEXT:    ori r5, r5, 8969
 ; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    lis r6, 22765
-; P9LE-NEXT:    ori r6, r6, 8969
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
-; P9LE-NEXT:    subf r4, r5, r4
+; P9LE-NEXT:    mulhwu r6, r4, r5
+; P9LE-NEXT:    subf r4, r6, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
-; P9LE-NEXT:    add r4, r4, r5
+; P9LE-NEXT:    add r4, r4, r6
 ; P9LE-NEXT:    srwi r4, r4, 6
-; P9LE-NEXT:    mulli r5, r4, 95
-; P9LE-NEXT:    subf r3, r5, r3
+; P9LE-NEXT:    mulli r6, r4, 95
+; P9LE-NEXT:    subf r3, r6, r3
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    clrlwi r5, r3, 16
-; P9LE-NEXT:    clrldi r7, r5, 32
-; P9LE-NEXT:    mulld r7, r7, r6
-; P9LE-NEXT:    rldicl r7, r7, 32, 32
-; P9LE-NEXT:    subf r5, r7, r5
-; P9LE-NEXT:    srwi r5, r5, 1
-; P9LE-NEXT:    add r5, r5, r7
-; P9LE-NEXT:    srwi r5, r5, 6
-; P9LE-NEXT:    mulli r7, r5, 95
+; P9LE-NEXT:    clrlwi r6, r3, 16
+; P9LE-NEXT:    mulhwu r7, r6, r5
+; P9LE-NEXT:    subf r6, r7, r6
+; P9LE-NEXT:    srwi r6, r6, 1
+; P9LE-NEXT:    add r6, r6, r7
+; P9LE-NEXT:    srwi r6, r6, 6
+; P9LE-NEXT:    mulli r7, r6, 95
 ; P9LE-NEXT:    subf r3, r7, r3
 ; P9LE-NEXT:    xxswapd v3, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r7, r3, 16
-; P9LE-NEXT:    clrldi r8, r7, 32
-; P9LE-NEXT:    mulld r8, r8, r6
-; P9LE-NEXT:    rldicl r8, r8, 32, 32
+; P9LE-NEXT:    mulhwu r8, r7, r5
 ; P9LE-NEXT:    subf r7, r8, r7
 ; P9LE-NEXT:    srwi r7, r7, 1
 ; P9LE-NEXT:    add r7, r7, r8
@@ -551,14 +499,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r8, r3, 16
-; P9LE-NEXT:    clrldi r9, r8, 32
-; P9LE-NEXT:    mulld r6, r9, r6
-; P9LE-NEXT:    rldicl r6, r6, 32, 32
-; P9LE-NEXT:    subf r8, r6, r8
+; P9LE-NEXT:    mulhwu r5, r8, r5
+; P9LE-NEXT:    subf r8, r5, r8
 ; P9LE-NEXT:    srwi r8, r8, 1
-; P9LE-NEXT:    add r6, r8, r6
-; P9LE-NEXT:    srwi r6, r6, 6
-; P9LE-NEXT:    mulli r8, r6, 95
+; P9LE-NEXT:    add r5, r8, r5
+; P9LE-NEXT:    srwi r5, r5, 6
+; P9LE-NEXT:    mulli r8, r5, 95
 ; P9LE-NEXT:    subf r3, r8, r3
 ; P9LE-NEXT:    vmrglh v3, v4, v3
 ; P9LE-NEXT:    xxswapd v4, vs0
@@ -568,12 +514,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9LE-NEXT:    vmrglh v2, v2, v4
 ; P9LE-NEXT:    vmrglw v2, v2, v3
 ; P9LE-NEXT:    xxswapd v3, vs0
-; P9LE-NEXT:    mtfprd f0, r5
+; P9LE-NEXT:    mtfprd f0, r6
 ; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r7
 ; P9LE-NEXT:    vmrglh v3, v4, v3
 ; P9LE-NEXT:    xxswapd v4, vs0
-; P9LE-NEXT:    mtfprd f0, r6
+; P9LE-NEXT:    mtfprd f0, r5
 ; P9LE-NEXT:    xxswapd v5, vs0
 ; P9LE-NEXT:    vmrglh v4, v5, v4
 ; P9LE-NEXT:    vmrglw v3, v4, v3
@@ -584,40 +530,34 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r5, 22765
+; P9BE-NEXT:    ori r5, r5, 8969
 ; P9BE-NEXT:    clrlwi r4, r3, 16
-; P9BE-NEXT:    lis r6, 22765
-; P9BE-NEXT:    ori r6, r6, 8969
-; P9BE-NEXT:    clrldi r5, r4, 32
-; P9BE-NEXT:    mulld r5, r5, r6
-; P9BE-NEXT:    rldicl r5, r5, 32, 32
-; P9BE-NEXT:    subf r4, r5, r4
+; P9BE-NEXT:    mulhwu r6, r4, r5
+; P9BE-NEXT:    subf r4, r6, r4
 ; P9BE-NEXT:    srwi r4, r4, 1
-; P9BE-NEXT:    add r4, r4, r5
+; P9BE-NEXT:    add r4, r4, r6
 ; P9BE-NEXT:    srwi r4, r4, 6
-; P9BE-NEXT:    mulli r5, r4, 95
-; P9BE-NEXT:    subf r3, r5, r3
+; P9BE-NEXT:    mulli r6, r4, 95
+; P9BE-NEXT:    subf r3, r6, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v3, r3
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    clrlwi r5, r3, 16
-; P9BE-NEXT:    clrldi r7, r5, 32
-; P9BE-NEXT:    mulld r7, r7, r6
-; P9BE-NEXT:    rldicl r7, r7, 32, 32
-; P9BE-NEXT:    subf r5, r7, r5
-; P9BE-NEXT:    srwi r5, r5, 1
-; P9BE-NEXT:    add r5, r5, r7
-; P9BE-NEXT:    srwi r5, r5, 6
-; P9BE-NEXT:    mulli r7, r5, 95
+; P9BE-NEXT:    clrlwi r6, r3, 16
+; P9BE-NEXT:    mulhwu r7, r6, r5
+; P9BE-NEXT:    subf r6, r7, r6
+; P9BE-NEXT:    srwi r6, r6, 1
+; P9BE-NEXT:    add r6, r6, r7
+; P9BE-NEXT:    srwi r6, r6, 6
+; P9BE-NEXT:    mulli r7, r6, 95
 ; P9BE-NEXT:    subf r3, r7, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r7, r3, 16
-; P9BE-NEXT:    clrldi r8, r7, 32
-; P9BE-NEXT:    mulld r8, r8, r6
-; P9BE-NEXT:    rldicl r8, r8, 32, 32
+; P9BE-NEXT:    mulhwu r8, r7, r5
 ; P9BE-NEXT:    subf r7, r8, r7
 ; P9BE-NEXT:    srwi r7, r7, 1
 ; P9BE-NEXT:    add r7, r7, r8
@@ -630,14 +570,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r8, r3, 32
-; P9BE-NEXT:    mulld r6, r8, r6
-; P9BE-NEXT:    rldicl r6, r6, 32, 32
-; P9BE-NEXT:    subf r8, r6, r3
+; P9BE-NEXT:    mulhwu r5, r3, r5
+; P9BE-NEXT:    subf r8, r5, r3
 ; P9BE-NEXT:    srwi r8, r8, 1
-; P9BE-NEXT:    add r6, r8, r6
-; P9BE-NEXT:    srwi r6, r6, 6
-; P9BE-NEXT:    mulli r8, r6, 95
+; P9BE-NEXT:    add r5, r8, r5
+; P9BE-NEXT:    srwi r5, r5, 6
+; P9BE-NEXT:    mulli r8, r5, 95
 ; P9BE-NEXT:    subf r3, r8, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v2, r3
@@ -645,12 +583,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P9BE-NEXT:    vmrghh v2, v2, v4
 ; P9BE-NEXT:    vmrghw v2, v2, v3
 ; P9BE-NEXT:    mtvsrd v3, r3
-; P9BE-NEXT:    sldi r3, r5, 48
+; P9BE-NEXT:    sldi r3, r6, 48
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    sldi r3, r7, 48
 ; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    mtvsrd v4, r3
-; P9BE-NEXT:    sldi r3, r6, 48
+; P9BE-NEXT:    sldi r3, r5, 48
 ; P9BE-NEXT:    mtvsrd v5, r3
 ; P9BE-NEXT:    vmrghh v4, v5, v4
 ; P9BE-NEXT:    vmrghw v3, v4, v3
@@ -660,68 +598,58 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ; P8LE-LABEL: combine_urem_udiv:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    lis r5, 22765
+; P8LE-NEXT:    lis r4, 22765
 ; P8LE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    std r29, -24(r1) # 8-byte Folded Spill
-; P8LE-NEXT:    ori r5, r5, 8969
-; P8LE-NEXT:    mffprd r6, f0
-; P8LE-NEXT:    clrldi r3, r6, 48
-; P8LE-NEXT:    rldicl r4, r6, 48, 48
-; P8LE-NEXT:    rldicl r7, r6, 32, 48
+; P8LE-NEXT:    ori r4, r4, 8969
+; P8LE-NEXT:    mffprd r5, f0
+; P8LE-NEXT:    clrldi r3, r5, 48
+; P8LE-NEXT:    rldicl r6, r5, 48, 48
 ; P8LE-NEXT:    clrlwi r8, r3, 16
-; P8LE-NEXT:    clrlwi r9, r4, 16
-; P8LE-NEXT:    rldicl r6, r6, 16, 48
-; P8LE-NEXT:    clrlwi r10, r7, 16
-; P8LE-NEXT:    clrldi r11, r8, 32
-; P8LE-NEXT:    clrlwi r12, r6, 16
-; P8LE-NEXT:    clrldi r0, r9, 32
-; P8LE-NEXT:    clrldi r30, r10, 32
-; P8LE-NEXT:    mulld r11, r11, r5
-; P8LE-NEXT:    clrldi r29, r12, 32
-; P8LE-NEXT:    mulld r0, r0, r5
-; P8LE-NEXT:    mulld r30, r30, r5
-; P8LE-NEXT:    mulld r5, r29, r5
-; P8LE-NEXT:    ld r29, -24(r1) # 8-byte Folded Reload
-; P8LE-NEXT:    rldicl r11, r11, 32, 32
-; P8LE-NEXT:    rldicl r0, r0, 32, 32
-; P8LE-NEXT:    rldicl r30, r30, 32, 32
-; P8LE-NEXT:    subf r8, r11, r8
-; P8LE-NEXT:    rldicl r5, r5, 32, 32
-; P8LE-NEXT:    subf r9, r0, r9
+; P8LE-NEXT:    rldicl r7, r5, 32, 48
+; P8LE-NEXT:    clrlwi r9, r6, 16
+; P8LE-NEXT:    mulhwu r10, r8, r4
+; P8LE-NEXT:    clrlwi r11, r7, 16
+; P8LE-NEXT:    rldicl r5, r5, 16, 48
+; P8LE-NEXT:    mulhwu r12, r9, r4
+; P8LE-NEXT:    mulhwu r0, r11, r4
+; P8LE-NEXT:    clrlwi r30, r5, 16
+; P8LE-NEXT:    mulhwu r4, r30, r4
+; P8LE-NEXT:    subf r8, r10, r8
 ; P8LE-NEXT:    srwi r8, r8, 1
-; P8LE-NEXT:    subf r10, r30, r10
-; P8LE-NEXT:    add r8, r8, r11
+; P8LE-NEXT:    subf r9, r12, r9
+; P8LE-NEXT:    add r8, r8, r10
+; P8LE-NEXT:    subf r10, r0, r11
 ; P8LE-NEXT:    srwi r9, r9, 1
 ; P8LE-NEXT:    srwi r10, r10, 1
-; P8LE-NEXT:    subf r11, r5, r12
-; P8LE-NEXT:    add r9, r9, r0
+; P8LE-NEXT:    subf r11, r4, r30
+; P8LE-NEXT:    add r9, r9, r12
 ; P8LE-NEXT:    srwi r8, r8, 6
-; P8LE-NEXT:    add r10, r10, r30
-; P8LE-NEXT:    srwi r11, r11, 1
 ; P8LE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT:    add r10, r10, r0
+; P8LE-NEXT:    srwi r11, r11, 1
 ; P8LE-NEXT:    srwi r9, r9, 6
+; P8LE-NEXT:    mtfprd f0, r8
 ; P8LE-NEXT:    mulli r12, r8, 95
 ; P8LE-NEXT:    srwi r10, r10, 6
-; P8LE-NEXT:    add r5, r11, r5
-; P8LE-NEXT:    mtfprd f0, r8
-; P8LE-NEXT:    mulli r8, r9, 95
+; P8LE-NEXT:    add r4, r11, r4
 ; P8LE-NEXT:    mtfprd f1, r9
+; P8LE-NEXT:    mulli r8, r9, 95
 ; P8LE-NEXT:    mulli r9, r10, 95
-; P8LE-NEXT:    srwi r5, r5, 6
-; P8LE-NEXT:    mtfprd f3, r5
-; P8LE-NEXT:    mulli r5, r5, 95
+; P8LE-NEXT:    srwi r4, r4, 6
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    xxswapd v3, vs1
 ; P8LE-NEXT:    mtfprd f2, r10
+; P8LE-NEXT:    mtfprd f3, r4
+; P8LE-NEXT:    mulli r4, r4, 95
+; P8LE-NEXT:    xxswapd v3, vs1
+; P8LE-NEXT:    xxswapd v1, vs2
 ; P8LE-NEXT:    subf r3, r12, r3
 ; P8LE-NEXT:    xxswapd v6, vs3
 ; P8LE-NEXT:    mtfprd f0, r3
 ; P8LE-NEXT:    subf r3, r9, r7
-; P8LE-NEXT:    subf r4, r8, r4
-; P8LE-NEXT:    xxswapd v1, vs2
+; P8LE-NEXT:    subf r6, r8, r6
 ; P8LE-NEXT:    mtfprd f4, r3
-; P8LE-NEXT:    subf r3, r5, r6
-; P8LE-NEXT:    mtfprd f1, r4
+; P8LE-NEXT:    subf r3, r4, r5
+; P8LE-NEXT:    mtfprd f1, r6
 ; P8LE-NEXT:    mtfprd f5, r3
 ; P8LE-NEXT:    xxswapd v5, vs4
 ; P8LE-NEXT:    vmrglh v2, v3, v2
@@ -738,71 +666,61 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
 ;
 ; P8BE-LABEL: combine_urem_udiv:
 ; P8BE:       # %bb.0:
-; P8BE-NEXT:    mfvsrd r6, v2
-; P8BE-NEXT:    lis r5, 22765
-; P8BE-NEXT:    std r30, -16(r1) # 8-byte Folded Spill
-; P8BE-NEXT:    ori r5, r5, 8969
-; P8BE-NEXT:    clrldi r3, r6, 48
-; P8BE-NEXT:    rldicl r4, r6, 48, 48
+; P8BE-NEXT:    mfvsrd r5, v2
+; P8BE-NEXT:    lis r4, 22765
+; P8BE-NEXT:    ori r4, r4, 8969
+; P8BE-NEXT:    clrldi r3, r5, 48
+; P8BE-NEXT:    rldicl r6, r5, 48, 48
 ; P8BE-NEXT:    clrlwi r8, r3, 16
-; P8BE-NEXT:    rldicl r7, r6, 32, 48
-; P8BE-NEXT:    clrlwi r9, r4, 16
-; P8BE-NEXT:    rldicl r6, r6, 16, 48
-; P8BE-NEXT:    clrldi r11, r8, 32
-; P8BE-NEXT:    clrlwi r10, r7, 16
-; P8BE-NEXT:    clrlwi r6, r6, 16
-; P8BE-NEXT:    clrldi r12, r9, 32
-; P8BE-NEXT:    mulld r11, r11, r5
-; P8BE-NEXT:    clrldi r0, r10, 32
-; P8BE-NEXT:    clrldi r30, r6, 32
-; P8BE-NEXT:    mulld r12, r12, r5
-; P8BE-NEXT:    mulld r0, r0, r5
-; P8BE-NEXT:    mulld r5, r30, r5
-; P8BE-NEXT:    ld r30, -16(r1) # 8-byte Folded Reload
-; P8BE-NEXT:    rldicl r11, r11, 32, 32
-; P8BE-NEXT:    rldicl r12, r12, 32, 32
-; P8BE-NEXT:    subf r8, r11, r8
-; P8BE-NEXT:    rldicl r5, r5, 32, 32
+; P8BE-NEXT:    rldicl r7, r5, 32, 48
+; P8BE-NEXT:    clrlwi r9, r6, 16
+; P8BE-NEXT:    rldicl r5, r5, 16, 48
+; P8BE-NEXT:    mulhwu r10, r8, r4
+; P8BE-NEXT:    clrlwi r11, r7, 16
+; P8BE-NEXT:    mulhwu r12, r9, r4
+; P8BE-NEXT:    clrlwi r5, r5, 16
+; P8BE-NEXT:    mulhwu r0, r11, r4
+; P8BE-NEXT:    mulhwu r4, r5, r4
+; P8BE-NEXT:    subf r8, r10, r8
 ; P8BE-NEXT:    subf r9, r12, r9
 ; P8BE-NEXT:    srwi r8, r8, 1
-; P8BE-NEXT:    rldicl r0, r0, 32, 32
-; P8BE-NEXT:    add r8, r8, r11
+; P8BE-NEXT:    add r8, r8, r10
+; P8BE-NEXT:    subf r10, r0, r11
 ; P8BE-NEXT:    srwi r9, r9, 1
-; P8BE-NEXT:    subf r11, r5, r6
-; P8BE-NEXT:    subf r10, r0, r10
+; P8BE-NEXT:    subf r11, r4, r5
 ; P8BE-NEXT:    add r9, r9, r12
 ; P8BE-NEXT:    srwi r8, r8, 6
 ; P8BE-NEXT:    srwi r11, r11, 1
 ; P8BE-NEXT:    srwi r10, r10, 1
 ; P8BE-NEXT:    srwi r9, r9, 6
-; P8BE-NEXT:    add r5, r11, r5
 ; P8BE-NEXT:    mulli r12, r8, 95
+; P8BE-NEXT:    add r4, r11, r4
 ; P8BE-NEXT:    add r10, r10, r0
-; P8BE-NEXT:    srwi r5, r5, 6
 ; P8BE-NEXT:    mulli r11, r9, 95
-; P8BE-NEXT:    sldi r9, r9, 48
+; P8BE-NEXT:    srwi r4, r4, 6
 ; P8BE-NEXT:    srwi r10, r10, 6
+; P8BE-NEXT:    sldi r9, r9, 48
 ; P8BE-NEXT:    sldi r8, r8, 48
 ; P8BE-NEXT:    mtvsrd v3, r9
-; P8BE-NEXT:    mulli r9, r5, 95
+; P8BE-NEXT:    mulli r9, r4, 95
 ; P8BE-NEXT:    mtvsrd v2, r8
 ; P8BE-NEXT:    mulli r8, r10, 95
-; P8BE-NEXT:    sldi r10, r10, 48
 ; P8BE-NEXT:    subf r3, r12, r3
-; P8BE-NEXT:    mtvsrd v4, r10
-; P8BE-NEXT:    subf r4, r11, r4
+; P8BE-NEXT:    subf r6, r11, r6
 ; P8BE-NEXT:    sldi r3, r3, 48
 ; P8BE-NEXT:    vmrghh v2, v3, v2
-; P8BE-NEXT:    sldi r4, r4, 48
+; P8BE-NEXT:    sldi r6, r6, 48
+; P8BE-NEXT:    sldi r10, r10, 48
 ; P8BE-NEXT:    mtvsrd v3, r3
-; P8BE-NEXT:    subf r3, r9, r6
+; P8BE-NEXT:    subf r3, r9, r5
 ; P8BE-NEXT:    subf r7, r8, r7
-; P8BE-NEXT:    mtvsrd v5, r4
+; P8BE-NEXT:    mtvsrd v5, r6
 ; P8BE-NEXT:    sldi r3, r3, 48
-; P8BE-NEXT:    sldi r6, r7, 48
+; P8BE-NEXT:    sldi r5, r7, 48
 ; P8BE-NEXT:    mtvsrd v1, r3
-; P8BE-NEXT:    sldi r3, r5, 48
-; P8BE-NEXT:    mtvsrd v0, r6
+; P8BE-NEXT:    sldi r3, r4, 48
+; P8BE-NEXT:    mtvsrd v4, r10
+; P8BE-NEXT:    mtvsrd v0, r5
 ; P8BE-NEXT:    vmrghh v3, v5, v3
 ; P8BE-NEXT:    mtvsrd v5, r3
 ; P8BE-NEXT:    vmrghh v0, v1, v0
@@ -832,14 +750,11 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    clrlwi r4, r3, 16
-; P9LE-NEXT:    lis r6, 22765
-; P9LE-NEXT:    ori r6, r6, 8969
+; P9LE-NEXT:    lis r5, 22765
+; P9LE-NEXT:    ori r5, r5, 8969
 ; P9LE-NEXT:    xxswapd v4, vs0
-; P9LE-NEXT:    vmrglh v3, v4, v3
-; P9LE-NEXT:    clrldi r5, r4, 32
-; P9LE-NEXT:    mulld r5, r5, r6
-; P9LE-NEXT:    rldicl r5, r5, 32, 32
+; P9LE-NEXT:    clrlwi r4, r3, 16
+; P9LE-NEXT:    mulhwu r5, r4, r5
 ; P9LE-NEXT:    subf r4, r5, r4
 ; P9LE-NEXT:    srwi r4, r4, 1
 ; P9LE-NEXT:    add r4, r4, r5
@@ -850,6 +765,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 4
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    clrlwi r3, r3, 29
+; P9LE-NEXT:    vmrglh v3, v4, v3
 ; P9LE-NEXT:    xxswapd v4, vs0
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    xxswapd v2, vs0
@@ -871,13 +787,11 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
-; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    lis r5, 22765
-; P9BE-NEXT:    ori r5, r5, 8969
+; P9BE-NEXT:    lis r4, 22765
+; P9BE-NEXT:    ori r4, r4, 8969
 ; P9BE-NEXT:    vmrghh v3, v4, v3
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    rldicl r4, r4, 32, 32
+; P9BE-NEXT:    clrlwi r3, r3, 16
+; P9BE-NEXT:    mulhwu r4, r3, r4
 ; P9BE-NEXT:    subf r5, r4, r3
 ; P9BE-NEXT:    srwi r5, r5, 1
 ; P9BE-NEXT:    add r4, r5, r4
@@ -902,28 +816,26 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P8LE-NEXT:    ori r3, r3, 8969
 ; P8LE-NEXT:    mffprd r4, f0
 ; P8LE-NEXT:    rldicl r5, r4, 16, 48
-; P8LE-NEXT:    clrlwi r6, r5, 16
-; P8LE-NEXT:    clrldi r7, r6, 32
-; P8LE-NEXT:    mulld r3, r7, r3
 ; P8LE-NEXT:    rldicl r7, r4, 48, 48
-; P8LE-NEXT:    clrlwi r7, r7, 27
-; P8LE-NEXT:    mtfprd f1, r7
-; P8LE-NEXT:    rldicl r3, r3, 32, 32
-; P8LE-NEXT:    xxswapd v3, vs1
+; P8LE-NEXT:    clrlwi r6, r5, 16
+; P8LE-NEXT:    mulhwu r3, r6, r3
 ; P8LE-NEXT:    subf r6, r3, r6
 ; P8LE-NEXT:    srwi r6, r6, 1
 ; P8LE-NEXT:    add r3, r6, r3
 ; P8LE-NEXT:    clrldi r6, r4, 48
 ; P8LE-NEXT:    srwi r3, r3, 6
-; P8LE-NEXT:    rldicl r4, r4, 32, 48
 ; P8LE-NEXT:    clrlwi r6, r6, 26
 ; P8LE-NEXT:    mulli r3, r3, 95
-; P8LE-NEXT:    clrlwi r4, r4, 29
+; P8LE-NEXT:    rldicl r4, r4, 32, 48
 ; P8LE-NEXT:    mtfprd f0, r6
+; P8LE-NEXT:    clrlwi r6, r7, 27
+; P8LE-NEXT:    clrlwi r4, r4, 29
+; P8LE-NEXT:    mtfprd f1, r6
 ; P8LE-NEXT:    mtfprd f3, r4
 ; P8LE-NEXT:    xxswapd v2, vs0
-; P8LE-NEXT:    xxswapd v5, vs3
+; P8LE-NEXT:    xxswapd v3, vs1
 ; P8LE-NEXT:    subf r3, r3, r5
+; P8LE-NEXT:    xxswapd v5, vs3
 ; P8LE-NEXT:    mtfprd f2, r3
 ; P8LE-NEXT:    vmrglh v2, v3, v2
 ; P8LE-NEXT:    xxswapd v4, vs2
@@ -940,9 +852,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
 ; P8BE-NEXT:    rldicl r7, r4, 16, 48
 ; P8BE-NEXT:    clrlwi r5, r5, 16
 ; P8BE-NEXT:    clrlwi r7, r7, 26
-; P8BE-NEXT:    clrldi r6, r5, 32
-; P8BE-NEXT:    mulld r3, r6, r3
-; P8BE-NEXT:    rldicl r3, r3, 32, 32
+; P8BE-NEXT:    mulhwu r3, r5, r3
 ; P8BE-NEXT:    subf r6, r3, r5
 ; P8BE-NEXT:    srwi r6, r6, 1
 ; P8BE-NEXT:    add r3, r6, r3
@@ -974,25 +884,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P9LE-LABEL: dont_fold_urem_one:
 ; P9LE:       # %bb.0:
 ; P9LE-NEXT:    li r3, 4
-; P9LE-NEXT:    li r5, 0
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    oris r6, r5, 45590
-; P9LE-NEXT:    oris r5, r5, 51306
-; P9LE-NEXT:    ori r6, r6, 17097
-; P9LE-NEXT:    ori r5, r5, 30865
-; P9LE-NEXT:    rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT:    mulld r4, r4, r6
-; P9LE-NEXT:    lis r6, 24749
-; P9LE-NEXT:    ori r6, r6, 47143
-; P9LE-NEXT:    rldicl r4, r4, 28, 36
+; P9LE-NEXT:    lis r5, -19946
+; P9LE-NEXT:    ori r5, r5, 17097
+; P9LE-NEXT:    clrlwi r4, r3, 16
+; P9LE-NEXT:    mulhwu r4, r4, r5
+; P9LE-NEXT:    lis r5, 24749
+; P9LE-NEXT:    ori r5, r5, 47143
+; P9LE-NEXT:    srwi r4, r4, 4
 ; P9LE-NEXT:    mulli r4, r4, 23
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    mtfprd f0, r3
 ; P9LE-NEXT:    li r3, 6
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
-; P9LE-NEXT:    rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT:    mulld r4, r4, r6
-; P9LE-NEXT:    rldicl r4, r4, 21, 43
+; P9LE-NEXT:    clrlwi r4, r3, 16
+; P9LE-NEXT:    mulhwu r4, r4, r5
+; P9LE-NEXT:    lis r5, -14230
+; P9LE-NEXT:    ori r5, r5, 30865
+; P9LE-NEXT:    srwi r4, r4, 11
 ; P9LE-NEXT:    mulli r4, r4, 5423
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    xxswapd v3, vs0
@@ -1000,8 +909,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P9LE-NEXT:    li r3, 2
 ; P9LE-NEXT:    vextuhrx r3, r3, v2
 ; P9LE-NEXT:    rlwinm r4, r3, 31, 17, 31
-; P9LE-NEXT:    mulld r4, r4, r5
-; P9LE-NEXT:    rldicl r4, r4, 24, 40
+; P9LE-NEXT:    mulhwu r4, r4, r5
+; P9LE-NEXT:    srwi r4, r4, 8
 ; P9LE-NEXT:    mulli r4, r4, 654
 ; P9LE-NEXT:    subf r3, r4, r3
 ; P9LE-NEXT:    xxswapd v4, vs0
@@ -1017,44 +926,41 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P9BE:       # %bb.0:
 ; P9BE-NEXT:    li r3, 6
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r4, 24749
+; P9BE-NEXT:    ori r4, r4, 47143
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    lis r5, 24749
-; P9BE-NEXT:    ori r5, r5, 47143
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r5
-; P9BE-NEXT:    li r5, 0
-; P9BE-NEXT:    oris r6, r5, 45590
-; P9BE-NEXT:    oris r5, r5, 51306
-; P9BE-NEXT:    ori r6, r6, 17097
-; P9BE-NEXT:    ori r5, r5, 30865
-; P9BE-NEXT:    rldicl r4, r4, 21, 43
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    srwi r4, r4, 11
 ; P9BE-NEXT:    mulli r4, r4, 5423
 ; P9BE-NEXT:    subf r3, r4, r3
+; P9BE-NEXT:    lis r4, -19946
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v3, r3
 ; P9BE-NEXT:    li r3, 4
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
 ; P9BE-NEXT:    clrlwi r3, r3, 16
-; P9BE-NEXT:    clrldi r4, r3, 32
-; P9BE-NEXT:    mulld r4, r4, r6
-; P9BE-NEXT:    rldicl r4, r4, 28, 36
+; P9BE-NEXT:    ori r4, r4, 17097
+; P9BE-NEXT:    mulhwu r4, r3, r4
+; P9BE-NEXT:    srwi r4, r4, 4
 ; P9BE-NEXT:    mulli r4, r4, 23
 ; P9BE-NEXT:    subf r3, r4, r3
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    li r3, 2
 ; P9BE-NEXT:    vextuhlx r3, r3, v2
+; P9BE-NEXT:    lis r5, -14230
+; P9BE-NEXT:    ori r5, r5, 30865
+; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    clrlwi r4, r3, 16
 ; P9BE-NEXT:    rlwinm r3, r3, 31, 17, 31
-; P9BE-NEXT:    mulld r3, r3, r5
-; P9BE-NEXT:    rldicl r3, r3, 24, 40
+; P9BE-NEXT:    mulhwu r3, r3, r5
+; P9BE-NEXT:    srwi r3, r3, 8
 ; P9BE-NEXT:    mulli r3, r3, 654
 ; P9BE-NEXT:    subf r3, r3, r4
 ; P9BE-NEXT:    sldi r3, r3, 48
 ; P9BE-NEXT:    mtvsrd v2, r3
 ; P9BE-NEXT:    li r3, 0
 ; P9BE-NEXT:    sldi r3, r3, 48
-; P9BE-NEXT:    vmrghh v3, v4, v3
 ; P9BE-NEXT:    mtvsrd v4, r3
 ; P9BE-NEXT:    vmrghh v2, v4, v2
 ; P9BE-NEXT:    vmrghw v2, v2, v3
@@ -1063,35 +969,34 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P8LE-LABEL: dont_fold_urem_one:
 ; P8LE:       # %bb.0:
 ; P8LE-NEXT:    xxswapd vs0, v2
-; P8LE-NEXT:    li r3, 0
-; P8LE-NEXT:    lis r8, 24749
+; P8LE-NEXT:    lis r3, -19946
+; P8LE-NEXT:    lis r7, 24749
+; P8LE-NEXT:    lis r9, -14230
 ; P8LE-NEXT:    xxlxor v5, v5, v5
-; P8LE-NEXT:    oris r5, r3, 45590
-; P8LE-NEXT:    ori r8, r8, 47143
-; P8LE-NEXT:    oris r3, r3, 51306
-; P8LE-NEXT:    ori r5, r5, 17097
-; P8LE-NEXT:    ori r3, r3, 30865
+; P8LE-NEXT:    ori r3, r3, 17097
+; P8LE-NEXT:    ori r7, r7, 47143
+; P8LE-NEXT:    ori r9, r9, 30865
 ; P8LE-NEXT:    mffprd r4, f0
-; P8LE-NEXT:    rldicl r6, r4, 32, 48
-; P8LE-NEXT:    rldicl r7, r4, 16, 48
-; P8LE-NEXT:    rlwinm r9, r6, 0, 16, 31
+; P8LE-NEXT:    rldicl r5, r4, 32, 48
+; P8LE-NEXT:    rldicl r6, r4, 16, 48
+; P8LE-NEXT:    clrlwi r8, r5, 16
 ; P8LE-NEXT:    rldicl r4, r4, 48, 48
-; P8LE-NEXT:    mulld r5, r9, r5
-; P8LE-NEXT:    rlwinm r9, r7, 0, 16, 31
-; P8LE-NEXT:    mulld r8, r9, r8
-; P8LE-NEXT:    rlwinm r9, r4, 31, 17, 31
-; P8LE-NEXT:    mulld r3, r9, r3
-; P8LE-NEXT:    rldicl r5, r5, 28, 36
-; P8LE-NEXT:    rldicl r8, r8, 21, 43
-; P8LE-NEXT:    mulli r5, r5, 23
-; P8LE-NEXT:    rldicl r3, r3, 24, 40
-; P8LE-NEXT:    mulli r8, r8, 5423
-; P8LE-NEXT:    mulli r3, r3, 654
-; P8LE-NEXT:    subf r5, r5, r6
-; P8LE-NEXT:    subf r6, r8, r7
-; P8LE-NEXT:    mtfprd f0, r5
-; P8LE-NEXT:    subf r3, r3, r4
-; P8LE-NEXT:    mtfprd f1, r6
+; P8LE-NEXT:    mulhwu r3, r8, r3
+; P8LE-NEXT:    clrlwi r8, r6, 16
+; P8LE-NEXT:    mulhwu r7, r8, r7
+; P8LE-NEXT:    rlwinm r8, r4, 31, 17, 31
+; P8LE-NEXT:    mulhwu r8, r8, r9
+; P8LE-NEXT:    srwi r3, r3, 4
+; P8LE-NEXT:    srwi r7, r7, 11
+; P8LE-NEXT:    mulli r3, r3, 23
+; P8LE-NEXT:    srwi r8, r8, 8
+; P8LE-NEXT:    mulli r7, r7, 5423
+; P8LE-NEXT:    mulli r8, r8, 654
+; P8LE-NEXT:    subf r3, r3, r5
+; P8LE-NEXT:    subf r5, r7, r6
+; P8LE-NEXT:    mtfprd f0, r3
+; P8LE-NEXT:    subf r3, r8, r4
+; P8LE-NEXT:    mtfprd f1, r5
 ; P8LE-NEXT:    mtfprd f2, r3
 ; P8LE-NEXT:    xxswapd v2, vs0
 ; P8LE-NEXT:    xxswapd v3, vs1
@@ -1104,45 +1009,42 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
 ; P8BE-LABEL: dont_fold_urem_one:
 ; P8BE:       # %bb.0:
 ; P8BE-NEXT:    mfvsrd r4, v2
-; P8BE-NEXT:    li r3, 0
-; P8BE-NEXT:    lis r8, 24749
-; P8BE-NEXT:    oris r6, r3, 51306
-; P8BE-NEXT:    ori r8, r8, 47143
-; P8BE-NEXT:    oris r3, r3, 45590
-; P8BE-NEXT:    rldicl r5, r4, 32, 48
-; P8BE-NEXT:    clrldi r7, r4, 48
-; P8BE-NEXT:    ori r6, r6, 30865
-; P8BE-NEXT:    ori r3, r3, 17097
-; P8BE-NEXT:    rldicl r4, r4, 48, 48
-; P8BE-NEXT:    rlwinm r9, r5, 31, 17, 31
-; P8BE-NEXT:    clrlwi r7, r7, 16
+; P8BE-NEXT:    lis r3, 24749
+; P8BE-NEXT:    lis r7, -19946
+; P8BE-NEXT:    lis r8, -14230
+; P8BE-NEXT:    ori r3, r3, 47143
+; P8BE-NEXT:    ori r7, r7, 17097
+; P8BE-NEXT:    ori r8, r8, 30865
+; P8BE-NEXT:    clrldi r5, r4, 48
+; P8BE-NEXT:    rldicl r6, r4, 48, 48
+; P8BE-NEXT:    rldicl r4, r4, 32, 48
 ; P8BE-NEXT:    clrlwi r5, r5, 16
+; P8BE-NEXT:    clrlwi r6, r6, 16
+; P8BE-NEXT:    mulhwu r3, r5, r3
+; P8BE-NEXT:    rlwinm r9, r4, 31, 17, 31
 ; P8BE-NEXT:    clrlwi r4, r4, 16
-; P8BE-NEXT:    mulld r6, r9, r6
-; P8BE-NEXT:    clrldi r9, r7, 32
-; P8BE-NEXT:    mulld r8, r9, r8
-; P8BE-NEXT:    clrldi r9, r4, 32
-; P8BE-NEXT:    mulld r3, r9, r3
+; P8BE-NEXT:    mulhwu r7, r6, r7
+; P8BE-NEXT:    mulhwu r8, r9, r8
 ; P8BE-NEXT:    li r9, 0
-; P8BE-NEXT:    rldicl r6, r6, 24, 40
-; P8BE-NEXT:    mulli r6, r6, 654
-; P8BE-NEXT:    rldicl r8, r8, 21, 43
-; P8BE-NEXT:    rldicl r3, r3, 28, 36
-; P8BE-NEXT:    mulli r8, r8, 5423
-; P8BE-NEXT:    mulli r3, r3, 23
-; P8BE-NEXT:    subf r5, r6, r5
-; P8BE-NEXT:    sldi r6, r9, 48
-; P8BE-NEXT:    mtvsrd v2, r6
-; P8BE-NEXT:    sldi r5, r5, 48
-; P8BE-NEXT:    subf r6, r8, r7
-; P8BE-NEXT:    mtvsrd v3, r5
-; P8BE-NEXT:    subf r3, r3, r4
-; P8BE-NEXT:    sldi r4, r6, 48
+; P8BE-NEXT:    srwi r3, r3, 11
+; P8BE-NEXT:    srwi r7, r7, 4
+; P8BE-NEXT:    mulli r3, r3, 5423
+; P8BE-NEXT:    srwi r8, r8, 8
+; P8BE-NEXT:    mulli r7, r7, 23
+; P8BE-NEXT:    mulli r8, r8, 654
+; P8BE-NEXT:    subf r3, r3, r5
+; P8BE-NEXT:    sldi r5, r9, 48
+; P8BE-NEXT:    mtvsrd v2, r5
+; P8BE-NEXT:    subf r5, r7, r6
 ; P8BE-NEXT:    sldi r3, r3, 48
-; P8BE-NEXT:    mtvsrd v4, r4
+; P8BE-NEXT:    subf r4, r8, r4
+; P8BE-NEXT:    sldi r5, r5, 48
+; P8BE-NEXT:    mtvsrd v3, r3
+; P8BE-NEXT:    sldi r3, r4, 48
+; P8BE-NEXT:    mtvsrd v4, r5
 ; P8BE-NEXT:    mtvsrd v5, r3
-; P8BE-NEXT:    vmrghh v2, v2, v3
-; P8BE-NEXT:    vmrghh v3, v5, v4
+; P8BE-NEXT:    vmrghh v3, v4, v3
+; P8BE-NEXT:    vmrghh v2, v2, v5
 ; P8BE-NEXT:    vmrghw v2, v2, v3
 ; P8BE-NEXT:    blr
   %1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>


        


More information about the llvm-commits mailing list