[llvm] b631f86 - [TLI][PowerPC] Introduce TLI query to check if MULH is cheaper than MUL + SHIFT
Amy Kwan via llvm-commits
llvm-commits at lists.llvm.org
Sat May 23 14:48:48 PDT 2020
Author: Amy Kwan
Date: 2020-05-23T16:47:12-05:00
New Revision: b631f86ac5b9df3f87ae963415d17e35104eca86
URL: https://github.com/llvm/llvm-project/commit/b631f86ac5b9df3f87ae963415d17e35104eca86
DIFF: https://github.com/llvm/llvm-project/commit/b631f86ac5b9df3f87ae963415d17e35104eca86.diff
LOG: [TLI][PowerPC] Introduce TLI query to check if MULH is cheaper than MUL + SHIFT
This patch introduces a TargetLowering query, isMulhCheaperThanMulShift.
Currently, DAG Combine transforms mulhs/mulhu into a wider multiply and a
shift if the wide multiply is legal. When this query returns true for the
type, that transformation is skipped and the mulhs/mulhu node is kept.
This TLI function is implemented on 64-bit PowerPC, where a multiply-high is
preferable to a multiply + shift for words and doublewords. Keeping the
multiply-high node can also enable further transformations.
Differential Revision: https://reviews.llvm.org/D78271
Added:
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.cpp
llvm/lib/Target/PowerPC/PPCISelLowering.h
llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
llvm/test/CodeGen/PowerPC/machine-pre.ll
llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 522621471af2..2689838b3e7c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1652,6 +1652,10 @@ class TargetLoweringBase {
virtual bool isJumpTableRelative() const;
+ /// Return true if a mulh[s|u] node for a specific type is cheaper than
+ /// a multiply followed by a shift. This is false by default.
+ virtual bool isMulhCheaperThanMulShift(EVT Type) const { return false; }
+
/// If a physical register, this specifies the register that
/// llvm.savestack/llvm.restorestack should save and restore.
unsigned getStackPointerRegisterToSaveRestore() const {
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index b076f5e0db49..40ceb5b34ad3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -4118,7 +4118,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
// If the type twice as wide is legal, transform the mulhs to a wider multiply
// plus a shift.
- if (VT.isSimple() && !VT.isVector()) {
+ if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
@@ -4174,7 +4174,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
// If the type twice as wide is legal, transform the mulhu to a wider multiply
// plus a shift.
- if (VT.isSimple() && !VT.isVector()) {
+ if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8b1ebba596a0..d42eaa7b7706 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1401,6 +1401,16 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
+/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
+/// type is cheaper than a multiply followed by a shift.
+/// This is true for words and doublewords on 64-bit PowerPC.
+bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
+ if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
+ isOperationLegal(ISD::MULHU, Type)))
+ return true;
+ return TargetLowering::isMulhCheaperThanMulShift(Type);
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index c34fd6aa78be..29d4e54edc67 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -950,6 +950,11 @@ namespace llvm {
Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ /// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a
+ /// specific type is cheaper than a multiply followed by a shift.
+ /// This is true for words and doublewords on 64-bit PowerPC.
+ bool isMulhCheaperThanMulShift(EVT Type) const override;
+
/// Override to support customized stack guard loading.
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
diff --git a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
index 42a2c7828052..2463e9114794 100644
--- a/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
+++ b/llvm/test/CodeGen/PowerPC/loop-instr-form-prepare.ll
@@ -509,10 +509,9 @@ define i64 @test_ds_cross_basic_blocks(i8* %0, i32 signext %1) {
; CHECK-NEXT: bdz .LBB6_9
; CHECK-NEXT: .LBB6_4: #
; CHECK-NEXT: lbzu r0, 1(r5)
-; CHECK-NEXT: clrldi r27, r0, 32
-; CHECK-NEXT: mulld r27, r27, r4
-; CHECK-NEXT: rldicl r27, r27, 31, 33
-; CHECK-NEXT: slwi r26, r27, 1
+; CHECK-NEXT: mulhwu r27, r0, r4
+; CHECK-NEXT: rlwinm r26, r27, 0, 0, 30
+; CHECK-NEXT: srwi r27, r27, 1
; CHECK-NEXT: add r27, r27, r26
; CHECK-NEXT: subf r0, r27, r0
; CHECK-NEXT: cmplwi r0, 1
diff --git a/llvm/test/CodeGen/PowerPC/machine-pre.ll b/llvm/test/CodeGen/PowerPC/machine-pre.ll
index 0a7949725723..38ed67c70989 100644
--- a/llvm/test/CodeGen/PowerPC/machine-pre.ll
+++ b/llvm/test/CodeGen/PowerPC/machine-pre.ll
@@ -91,14 +91,12 @@ define dso_local signext i32 @foo(i32 signext %x, i32 signext %y) nounwind {
; CHECK-P9-NEXT: bl bar
; CHECK-P9-NEXT: nop
; CHECK-P9-NEXT: mr r30, r3
-; CHECK-P9-NEXT: extsw r3, r28
-; CHECK-P9-NEXT: mulld r4, r3, r27
-; CHECK-P9-NEXT: rldicl r5, r4, 1, 63
-; CHECK-P9-NEXT: rldicl r4, r4, 32, 32
-; CHECK-P9-NEXT: add r4, r4, r5
-; CHECK-P9-NEXT: slwi r5, r4, 1
-; CHECK-P9-NEXT: add r4, r4, r5
-; CHECK-P9-NEXT: subf r3, r4, r3
+; CHECK-P9-NEXT: mulhw r3, r28, r27
+; CHECK-P9-NEXT: srwi r4, r3, 31
+; CHECK-P9-NEXT: add r3, r3, r4
+; CHECK-P9-NEXT: slwi r4, r3, 1
+; CHECK-P9-NEXT: add r3, r3, r4
+; CHECK-P9-NEXT: subf r3, r3, r28
; CHECK-P9-NEXT: cmplwi r3, 1
; CHECK-P9-NEXT: beq cr0, .LBB1_1
; CHECK-P9-NEXT: # %bb.5: # %while.cond
diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
index 547f9273f5a4..56299427ab9d 100644
--- a/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
+++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-mod.ll
@@ -205,13 +205,13 @@ entry:
ret i32 %rem
; CHECK-LABEL: modulo_const3_sw
; CHECK-NOT: modsw
-; CHECK: mull
+; CHECK: mulh
; CHECK-NOT: modsw
; CHECK: sub
; CHECK-NOT: modsw
; CHECK: blr
; CHECK-PWR8-LABEL: modulo_const3_sw
-; CHECK-PWR8: mull
+; CHECK-PWR8: mulh
; CHECK-PWR8: sub
; CHECK-PWR8: blr
}
diff --git a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
index 051e467cf39b..cda3fbb52ee8 100644
--- a/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/srem-vector-lkk.ll
@@ -13,12 +13,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -21386
; P9LE-NEXT: ori r5, r5, 37253
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -30,10 +28,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
; P9LE-NEXT: ori r5, r5, 63421
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: subf r4, r4, r5
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -46,11 +42,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
; P9LE-NEXT: ori r5, r5, 33437
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r5, r4, 1, 63
-; P9LE-NEXT: rldicl r4, r4, 32, 32
+; P9LE-NEXT: mulhw r4, r4, r5
+; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 5
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: lis r5, -16728
@@ -61,11 +55,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
; P9LE-NEXT: ori r5, r5, 63249
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r5, r4, 1, 63
-; P9LE-NEXT: rldicl r4, r4, 32, 32
+; P9LE-NEXT: mulhw r4, r4, r5
+; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 8
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, -1003
@@ -82,12 +74,10 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: lis r4, 31710
; P9BE-NEXT: ori r4, r4, 63421
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: extsh r3, r3
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: subf r4, r3, r4
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 6
@@ -100,10 +90,8 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 37253
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 6
@@ -116,11 +104,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 63249
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r5, r4, 1, 63
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
+; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 8
; P9BE-NEXT: add r4, r4, r5
; P9BE-NEXT: mulli r4, r4, -1003
@@ -132,11 +118,9 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 33437
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r5, r4, 1, 63
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
+; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 5
; P9BE-NEXT: add r4, r4, r5
; P9BE-NEXT: mulli r4, r4, 98
@@ -150,61 +134,51 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P8LE-LABEL: fold_srem_vec_1:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r4, 21399
-; P8LE-NEXT: lis r9, -16728
-; P8LE-NEXT: lis r11, -21386
-; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT: ori r4, r4, 33437
-; P8LE-NEXT: ori r9, r9, 63249
-; P8LE-NEXT: ori r11, r11, 37253
-; P8LE-NEXT: mffprd r5, f0
-; P8LE-NEXT: rldicl r3, r5, 32, 48
-; P8LE-NEXT: rldicl r6, r5, 16, 48
-; P8LE-NEXT: clrldi r7, r5, 48
-; P8LE-NEXT: extsh r8, r3
-; P8LE-NEXT: extsh r10, r6
-; P8LE-NEXT: rldicl r5, r5, 48, 48
-; P8LE-NEXT: extsw r8, r8
-; P8LE-NEXT: extsh r12, r7
-; P8LE-NEXT: extsw r10, r10
-; P8LE-NEXT: mulld r4, r8, r4
-; P8LE-NEXT: lis r8, 31710
-; P8LE-NEXT: extsh r0, r5
-; P8LE-NEXT: extsw r12, r12
-; P8LE-NEXT: mulld r9, r10, r9
-; P8LE-NEXT: ori r8, r8, 63421
-; P8LE-NEXT: extsw r10, r0
-; P8LE-NEXT: mulld r11, r12, r11
-; P8LE-NEXT: mulld r8, r10, r8
-; P8LE-NEXT: rldicl r0, r4, 1, 63
-; P8LE-NEXT: rldicl r4, r4, 32, 32
-; P8LE-NEXT: rldicl r30, r9, 1, 63
-; P8LE-NEXT: rldicl r9, r9, 32, 32
-; P8LE-NEXT: rldicl r11, r11, 32, 32
-; P8LE-NEXT: rldicl r8, r8, 32, 32
-; P8LE-NEXT: add r11, r11, r12
-; P8LE-NEXT: srawi r4, r4, 5
-; P8LE-NEXT: subf r8, r10, r8
-; P8LE-NEXT: srawi r9, r9, 8
-; P8LE-NEXT: srwi r10, r11, 31
-; P8LE-NEXT: add r4, r4, r0
-; P8LE-NEXT: srawi r11, r11, 6
-; P8LE-NEXT: add r9, r9, r30
-; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
-; P8LE-NEXT: add r10, r11, r10
-; P8LE-NEXT: srwi r11, r8, 31
-; P8LE-NEXT: srawi r8, r8, 6
-; P8LE-NEXT: mulli r4, r4, 98
-; P8LE-NEXT: mulli r9, r9, -1003
-; P8LE-NEXT: add r8, r8, r11
-; P8LE-NEXT: mulli r10, r10, 95
-; P8LE-NEXT: mulli r8, r8, -124
-; P8LE-NEXT: subf r3, r4, r3
-; P8LE-NEXT: subf r4, r9, r6
+; P8LE-NEXT: lis r3, 21399
+; P8LE-NEXT: lis r9, -21386
+; P8LE-NEXT: lis r11, 31710
+; P8LE-NEXT: lis r8, -16728
+; P8LE-NEXT: ori r3, r3, 33437
+; P8LE-NEXT: ori r9, r9, 37253
+; P8LE-NEXT: ori r8, r8, 63249
+; P8LE-NEXT: mffprd r4, f0
+; P8LE-NEXT: rldicl r5, r4, 32, 48
+; P8LE-NEXT: clrldi r7, r4, 48
+; P8LE-NEXT: rldicl r6, r4, 16, 48
+; P8LE-NEXT: rldicl r4, r4, 48, 48
+; P8LE-NEXT: extsh r10, r5
+; P8LE-NEXT: extsh r0, r7
+; P8LE-NEXT: mulhw r3, r10, r3
+; P8LE-NEXT: ori r10, r11, 63421
+; P8LE-NEXT: extsh r11, r4
+; P8LE-NEXT: extsh r12, r6
+; P8LE-NEXT: mulhw r9, r0, r9
+; P8LE-NEXT: mulhw r10, r11, r10
+; P8LE-NEXT: mulhw r8, r12, r8
+; P8LE-NEXT: srwi r12, r3, 31
+; P8LE-NEXT: srawi r3, r3, 5
+; P8LE-NEXT: add r9, r9, r0
+; P8LE-NEXT: subf r10, r11, r10
+; P8LE-NEXT: add r3, r3, r12
+; P8LE-NEXT: srwi r11, r9, 31
+; P8LE-NEXT: srawi r9, r9, 6
+; P8LE-NEXT: srwi r12, r8, 31
+; P8LE-NEXT: srawi r8, r8, 8
+; P8LE-NEXT: add r9, r9, r11
+; P8LE-NEXT: srwi r11, r10, 31
+; P8LE-NEXT: srawi r10, r10, 6
+; P8LE-NEXT: add r8, r8, r12
+; P8LE-NEXT: mulli r3, r3, 98
+; P8LE-NEXT: add r10, r10, r11
+; P8LE-NEXT: mulli r8, r8, -1003
+; P8LE-NEXT: mulli r9, r9, 95
+; P8LE-NEXT: mulli r10, r10, -124
+; P8LE-NEXT: subf r3, r3, r5
+; P8LE-NEXT: subf r5, r8, r6
; P8LE-NEXT: mtfprd f0, r3
-; P8LE-NEXT: subf r3, r10, r7
-; P8LE-NEXT: mtfprd f1, r4
-; P8LE-NEXT: subf r4, r8, r5
+; P8LE-NEXT: subf r3, r9, r7
+; P8LE-NEXT: subf r4, r10, r4
+; P8LE-NEXT: mtfprd f1, r5
; P8LE-NEXT: mtfprd f2, r3
; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: mtfprd f3, r4
@@ -220,42 +194,34 @@ define <4 x i16> @fold_srem_vec_1(<4 x i16> %x) {
; P8BE: # %bb.0:
; P8BE-NEXT: mfvsrd r4, v2
; P8BE-NEXT: lis r3, -16728
-; P8BE-NEXT: lis r9, 31710
; P8BE-NEXT: lis r8, 21399
+; P8BE-NEXT: lis r9, 31710
; P8BE-NEXT: lis r10, -21386
; P8BE-NEXT: ori r3, r3, 63249
-; P8BE-NEXT: ori r9, r9, 63421
; P8BE-NEXT: ori r8, r8, 33437
+; P8BE-NEXT: ori r9, r9, 63421
; P8BE-NEXT: ori r10, r10, 37253
; P8BE-NEXT: clrldi r5, r4, 48
-; P8BE-NEXT: rldicl r7, r4, 32, 48
; P8BE-NEXT: rldicl r6, r4, 48, 48
-; P8BE-NEXT: rldicl r4, r4, 16, 48
+; P8BE-NEXT: rldicl r7, r4, 32, 48
; P8BE-NEXT: extsh r5, r5
-; P8BE-NEXT: extsh r7, r7
; P8BE-NEXT: extsh r6, r6
-; P8BE-NEXT: extsw r5, r5
+; P8BE-NEXT: rldicl r4, r4, 16, 48
+; P8BE-NEXT: extsh r7, r7
+; P8BE-NEXT: mulhw r3, r5, r3
; P8BE-NEXT: extsh r4, r4
-; P8BE-NEXT: extsw r7, r7
-; P8BE-NEXT: extsw r6, r6
-; P8BE-NEXT: mulld r3, r5, r3
-; P8BE-NEXT: extsw r4, r4
-; P8BE-NEXT: mulld r9, r7, r9
-; P8BE-NEXT: mulld r8, r6, r8
-; P8BE-NEXT: mulld r10, r4, r10
-; P8BE-NEXT: rldicl r11, r3, 1, 63
-; P8BE-NEXT: rldicl r3, r3, 32, 32
-; P8BE-NEXT: rldicl r9, r9, 32, 32
-; P8BE-NEXT: rldicl r12, r8, 1, 63
-; P8BE-NEXT: rldicl r8, r8, 32, 32
-; P8BE-NEXT: rldicl r10, r10, 32, 32
-; P8BE-NEXT: subf r9, r7, r9
+; P8BE-NEXT: mulhw r8, r6, r8
+; P8BE-NEXT: mulhw r9, r7, r9
+; P8BE-NEXT: mulhw r10, r4, r10
+; P8BE-NEXT: srwi r11, r3, 31
; P8BE-NEXT: srawi r3, r3, 8
+; P8BE-NEXT: add r3, r3, r11
+; P8BE-NEXT: srwi r11, r8, 31
+; P8BE-NEXT: subf r9, r7, r9
; P8BE-NEXT: srawi r8, r8, 5
; P8BE-NEXT: add r10, r10, r4
-; P8BE-NEXT: add r3, r3, r11
+; P8BE-NEXT: add r8, r8, r11
; P8BE-NEXT: srwi r11, r9, 31
-; P8BE-NEXT: add r8, r8, r12
; P8BE-NEXT: srawi r9, r9, 6
; P8BE-NEXT: mulli r3, r3, -1003
; P8BE-NEXT: add r9, r9, r11
@@ -290,12 +256,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -21386
; P9LE-NEXT: ori r5, r5, 37253
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r6, r4, r5
-; P9LE-NEXT: rldicl r6, r6, 32, 32
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r6, r4, r5
; P9LE-NEXT: add r4, r6, r4
; P9LE-NEXT: srwi r6, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -306,9 +270,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r6, r4, r5
-; P9LE-NEXT: rldicl r6, r6, 32, 32
+; P9LE-NEXT: mulhw r6, r4, r5
; P9LE-NEXT: add r4, r6, r4
; P9LE-NEXT: srwi r6, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -320,9 +282,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r6, r4, r5
-; P9LE-NEXT: rldicl r6, r6, 32, 32
+; P9LE-NEXT: mulhw r6, r4, r5
; P9LE-NEXT: add r4, r6, r4
; P9LE-NEXT: srwi r6, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -334,9 +294,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -355,12 +313,10 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: lis r4, -21386
; P9BE-NEXT: ori r4, r4, 37253
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r5, r3, r4
-; P9BE-NEXT: rldicl r5, r5, 32, 32
+; P9BE-NEXT: extsh r3, r3
+; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: add r5, r5, r3
; P9BE-NEXT: srwi r6, r5, 31
; P9BE-NEXT: srawi r5, r5, 6
@@ -372,9 +328,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r5, r3, r4
-; P9BE-NEXT: rldicl r5, r5, 32, 32
+; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: add r5, r5, r3
; P9BE-NEXT: srwi r6, r5, 31
; P9BE-NEXT: srawi r5, r5, 6
@@ -386,9 +340,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r5, r3, r4
-; P9BE-NEXT: rldicl r5, r5, 32, 32
+; P9BE-NEXT: mulhw r5, r3, r4
; P9BE-NEXT: add r5, r5, r3
; P9BE-NEXT: srwi r6, r5, 31
; P9BE-NEXT: srawi r5, r5, 6
@@ -401,9 +353,7 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 6
@@ -419,64 +369,56 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P8LE-LABEL: fold_srem_vec_2:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r4, -21386
+; P8LE-NEXT: lis r3, -21386
; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT: ori r4, r4, 37253
-; P8LE-NEXT: mffprd r5, f0
-; P8LE-NEXT: clrldi r3, r5, 48
-; P8LE-NEXT: rldicl r7, r5, 32, 48
-; P8LE-NEXT: extsh r8, r3
-; P8LE-NEXT: rldicl r6, r5, 48, 48
-; P8LE-NEXT: extsh r10, r7
-; P8LE-NEXT: rldicl r5, r5, 16, 48
-; P8LE-NEXT: extsw r8, r8
+; P8LE-NEXT: ori r3, r3, 37253
+; P8LE-NEXT: mffprd r4, f0
+; P8LE-NEXT: clrldi r5, r4, 48
+; P8LE-NEXT: rldicl r6, r4, 48, 48
+; P8LE-NEXT: extsh r8, r5
+; P8LE-NEXT: rldicl r7, r4, 32, 48
; P8LE-NEXT: extsh r9, r6
-; P8LE-NEXT: extsw r10, r10
-; P8LE-NEXT: extsh r11, r5
-; P8LE-NEXT: mulld r12, r8, r4
-; P8LE-NEXT: extsw r9, r9
-; P8LE-NEXT: extsw r11, r11
-; P8LE-NEXT: mulld r30, r10, r4
-; P8LE-NEXT: mulld r0, r9, r4
-; P8LE-NEXT: mulld r4, r11, r4
-; P8LE-NEXT: rldicl r12, r12, 32, 32
-; P8LE-NEXT: add r8, r12, r8
-; P8LE-NEXT: rldicl r12, r30, 32, 32
-; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
-; P8LE-NEXT: rldicl r0, r0, 32, 32
-; P8LE-NEXT: rldicl r4, r4, 32, 32
-; P8LE-NEXT: add r10, r12, r10
-; P8LE-NEXT: add r9, r0, r9
-; P8LE-NEXT: srwi r0, r8, 31
-; P8LE-NEXT: add r4, r4, r11
-; P8LE-NEXT: srwi r11, r10, 31
+; P8LE-NEXT: mulhw r10, r8, r3
+; P8LE-NEXT: rldicl r4, r4, 16, 48
+; P8LE-NEXT: extsh r11, r7
+; P8LE-NEXT: mulhw r12, r9, r3
+; P8LE-NEXT: extsh r0, r4
+; P8LE-NEXT: mulhw r30, r11, r3
+; P8LE-NEXT: mulhw r3, r0, r3
+; P8LE-NEXT: add r8, r10, r8
+; P8LE-NEXT: add r9, r12, r9
+; P8LE-NEXT: srwi r10, r8, 31
; P8LE-NEXT: srawi r8, r8, 6
-; P8LE-NEXT: srawi r10, r10, 6
-; P8LE-NEXT: srwi r12, r9, 31
-; P8LE-NEXT: add r8, r8, r0
+; P8LE-NEXT: add r11, r30, r11
+; P8LE-NEXT: add r3, r3, r0
+; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT: add r8, r8, r10
+; P8LE-NEXT: srwi r10, r9, 31
; P8LE-NEXT: srawi r9, r9, 6
-; P8LE-NEXT: add r10, r10, r11
-; P8LE-NEXT: srwi r11, r4, 31
-; P8LE-NEXT: srawi r4, r4, 6
-; P8LE-NEXT: add r9, r9, r12
; P8LE-NEXT: mulli r8, r8, 95
-; P8LE-NEXT: add r4, r4, r11
+; P8LE-NEXT: add r9, r9, r10
+; P8LE-NEXT: srwi r10, r11, 31
+; P8LE-NEXT: srawi r11, r11, 6
; P8LE-NEXT: mulli r9, r9, 95
+; P8LE-NEXT: add r10, r11, r10
+; P8LE-NEXT: srwi r11, r3, 31
+; P8LE-NEXT: srawi r3, r3, 6
; P8LE-NEXT: mulli r10, r10, 95
-; P8LE-NEXT: mulli r4, r4, 95
-; P8LE-NEXT: subf r3, r8, r3
+; P8LE-NEXT: subf r5, r8, r5
+; P8LE-NEXT: add r3, r3, r11
+; P8LE-NEXT: mtfprd f0, r5
+; P8LE-NEXT: mulli r3, r3, 95
; P8LE-NEXT: subf r6, r9, r6
-; P8LE-NEXT: mtfprd f0, r3
-; P8LE-NEXT: subf r3, r10, r7
-; P8LE-NEXT: subf r4, r4, r5
; P8LE-NEXT: mtfprd f1, r6
-; P8LE-NEXT: mtfprd f2, r3
; P8LE-NEXT: xxswapd v2, vs0
-; P8LE-NEXT: mtfprd f3, r4
+; P8LE-NEXT: subf r5, r10, r7
+; P8LE-NEXT: mtfprd f2, r5
; P8LE-NEXT: xxswapd v3, vs1
+; P8LE-NEXT: subf r3, r3, r4
+; P8LE-NEXT: mtfprd f3, r3
; P8LE-NEXT: xxswapd v4, vs2
-; P8LE-NEXT: xxswapd v5, vs3
; P8LE-NEXT: vmrglh v2, v3, v2
+; P8LE-NEXT: xxswapd v5, vs3
; P8LE-NEXT: vmrglh v3, v5, v4
; P8LE-NEXT: vmrglw v2, v3, v2
; P8LE-NEXT: blr
@@ -491,29 +433,21 @@ define <4 x i16> @fold_srem_vec_2(<4 x i16> %x) {
; P8BE-NEXT: extsh r5, r5
; P8BE-NEXT: rldicl r7, r4, 32, 48
; P8BE-NEXT: extsh r6, r6
-; P8BE-NEXT: extsw r5, r5
+; P8BE-NEXT: mulhw r8, r5, r3
; P8BE-NEXT: rldicl r4, r4, 16, 48
; P8BE-NEXT: extsh r7, r7
-; P8BE-NEXT: extsw r6, r6
-; P8BE-NEXT: mulld r8, r5, r3
+; P8BE-NEXT: mulhw r9, r6, r3
; P8BE-NEXT: extsh r4, r4
-; P8BE-NEXT: extsw r7, r7
-; P8BE-NEXT: mulld r9, r6, r3
-; P8BE-NEXT: extsw r4, r4
-; P8BE-NEXT: mulld r10, r7, r3
-; P8BE-NEXT: mulld r3, r4, r3
-; P8BE-NEXT: rldicl r8, r8, 32, 32
-; P8BE-NEXT: rldicl r9, r9, 32, 32
+; P8BE-NEXT: mulhw r10, r7, r3
+; P8BE-NEXT: mulhw r3, r4, r3
; P8BE-NEXT: add r8, r8, r5
-; P8BE-NEXT: rldicl r10, r10, 32, 32
; P8BE-NEXT: add r9, r9, r6
; P8BE-NEXT: srwi r11, r8, 31
; P8BE-NEXT: srawi r8, r8, 6
-; P8BE-NEXT: rldicl r3, r3, 32, 32
; P8BE-NEXT: add r10, r10, r7
+; P8BE-NEXT: add r3, r3, r4
; P8BE-NEXT: add r8, r8, r11
; P8BE-NEXT: srwi r11, r9, 31
-; P8BE-NEXT: add r3, r3, r4
; P8BE-NEXT: srawi r9, r9, 6
; P8BE-NEXT: mulli r8, r8, 95
; P8BE-NEXT: add r9, r9, r11
@@ -553,12 +487,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -21386
; P9LE-NEXT: ori r5, r5, 37253
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r6, r4, r5
-; P9LE-NEXT: rldicl r6, r6, 32, 32
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r6, r4, r5
; P9LE-NEXT: add r4, r6, r4
; P9LE-NEXT: srwi r6, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -569,9 +501,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r6, r3
-; P9LE-NEXT: extsw r6, r6
-; P9LE-NEXT: mulld r7, r6, r5
-; P9LE-NEXT: rldicl r7, r7, 32, 32
+; P9LE-NEXT: mulhw r7, r6, r5
; P9LE-NEXT: add r6, r7, r6
; P9LE-NEXT: srwi r7, r6, 31
; P9LE-NEXT: srawi r6, r6, 6
@@ -583,9 +513,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r7, r3
-; P9LE-NEXT: extsw r7, r7
-; P9LE-NEXT: mulld r8, r7, r5
-; P9LE-NEXT: rldicl r8, r8, 32, 32
+; P9LE-NEXT: mulhw r8, r7, r5
; P9LE-NEXT: add r7, r8, r7
; P9LE-NEXT: srwi r8, r7, 31
; P9LE-NEXT: srawi r7, r7, 6
@@ -597,9 +525,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r8, r3
-; P9LE-NEXT: extsw r8, r8
-; P9LE-NEXT: mulld r5, r8, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhw r5, r8, r5
; P9LE-NEXT: add r5, r5, r8
; P9LE-NEXT: srwi r8, r5, 31
; P9LE-NEXT: srawi r5, r5, 6
@@ -630,12 +556,10 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: extsh r4, r3
; P9BE-NEXT: lis r5, -21386
; P9BE-NEXT: ori r5, r5, 37253
-; P9BE-NEXT: extsw r4, r4
-; P9BE-NEXT: mulld r6, r4, r5
-; P9BE-NEXT: rldicl r6, r6, 32, 32
+; P9BE-NEXT: extsh r4, r3
+; P9BE-NEXT: mulhw r6, r4, r5
; P9BE-NEXT: add r4, r6, r4
; P9BE-NEXT: srwi r6, r4, 31
; P9BE-NEXT: srawi r4, r4, 6
@@ -647,9 +571,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r6, r3
-; P9BE-NEXT: extsw r6, r6
-; P9BE-NEXT: mulld r7, r6, r5
-; P9BE-NEXT: rldicl r7, r7, 32, 32
+; P9BE-NEXT: mulhw r7, r6, r5
; P9BE-NEXT: add r6, r7, r6
; P9BE-NEXT: srwi r7, r6, 31
; P9BE-NEXT: srawi r6, r6, 6
@@ -661,9 +583,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r7, r3
-; P9BE-NEXT: extsw r7, r7
-; P9BE-NEXT: mulld r8, r7, r5
-; P9BE-NEXT: rldicl r8, r8, 32, 32
+; P9BE-NEXT: mulhw r8, r7, r5
; P9BE-NEXT: add r7, r8, r7
; P9BE-NEXT: srwi r8, r7, 31
; P9BE-NEXT: srawi r7, r7, 6
@@ -676,9 +596,7 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r5, r3, r5
-; P9BE-NEXT: rldicl r5, r5, 32, 32
+; P9BE-NEXT: mulhw r5, r3, r5
; P9BE-NEXT: add r5, r5, r3
; P9BE-NEXT: srwi r8, r5, 31
; P9BE-NEXT: srawi r5, r5, 6
@@ -706,66 +624,58 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
; P8LE-LABEL: combine_srem_sdiv:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r5, -21386
+; P8LE-NEXT: lis r4, -21386
; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT: ori r5, r5, 37253
-; P8LE-NEXT: mffprd r6, f0
-; P8LE-NEXT: clrldi r3, r6, 48
-; P8LE-NEXT: rldicl r4, r6, 48, 48
-; P8LE-NEXT: rldicl r7, r6, 32, 48
+; P8LE-NEXT: ori r4, r4, 37253
+; P8LE-NEXT: mffprd r5, f0
+; P8LE-NEXT: clrldi r3, r5, 48
+; P8LE-NEXT: rldicl r6, r5, 48, 48
+; P8LE-NEXT: rldicl r7, r5, 32, 48
; P8LE-NEXT: extsh r8, r3
-; P8LE-NEXT: extsh r9, r4
-; P8LE-NEXT: rldicl r6, r6, 16, 48
+; P8LE-NEXT: extsh r9, r6
; P8LE-NEXT: extsh r10, r7
-; P8LE-NEXT: extsw r8, r8
-; P8LE-NEXT: extsw r9, r9
-; P8LE-NEXT: extsh r11, r6
-; P8LE-NEXT: extsw r10, r10
-; P8LE-NEXT: mulld r12, r8, r5
-; P8LE-NEXT: extsw r11, r11
-; P8LE-NEXT: mulld r0, r9, r5
-; P8LE-NEXT: mulld r30, r10, r5
-; P8LE-NEXT: mulld r5, r11, r5
-; P8LE-NEXT: rldicl r12, r12, 32, 32
-; P8LE-NEXT: rldicl r0, r0, 32, 32
-; P8LE-NEXT: rldicl r30, r30, 32, 32
-; P8LE-NEXT: add r8, r12, r8
-; P8LE-NEXT: rldicl r5, r5, 32, 32
-; P8LE-NEXT: add r9, r0, r9
-; P8LE-NEXT: add r10, r30, r10
-; P8LE-NEXT: srwi r12, r8, 31
-; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT: mulhw r11, r8, r4
+; P8LE-NEXT: rldicl r5, r5, 16, 48
+; P8LE-NEXT: mulhw r12, r9, r4
+; P8LE-NEXT: mulhw r0, r10, r4
+; P8LE-NEXT: extsh r30, r5
+; P8LE-NEXT: mulhw r4, r30, r4
+; P8LE-NEXT: add r8, r11, r8
+; P8LE-NEXT: add r9, r12, r9
+; P8LE-NEXT: srwi r11, r8, 31
+; P8LE-NEXT: add r10, r0, r10
; P8LE-NEXT: srawi r8, r8, 6
-; P8LE-NEXT: srawi r0, r9, 6
+; P8LE-NEXT: srawi r12, r9, 6
; P8LE-NEXT: srwi r9, r9, 31
-; P8LE-NEXT: add r5, r5, r11
-; P8LE-NEXT: add r8, r8, r12
-; P8LE-NEXT: srawi r12, r10, 6
+; P8LE-NEXT: add r8, r8, r11
+; P8LE-NEXT: add r4, r4, r30
+; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT: srawi r11, r10, 6
; P8LE-NEXT: srwi r10, r10, 31
-; P8LE-NEXT: add r9, r0, r9
-; P8LE-NEXT: mulli r0, r8, 95
-; P8LE-NEXT: add r10, r12, r10
+; P8LE-NEXT: add r9, r12, r9
; P8LE-NEXT: mtfprd f0, r8
-; P8LE-NEXT: srwi r8, r5, 31
-; P8LE-NEXT: srawi r5, r5, 6
-; P8LE-NEXT: mulli r11, r9, 95
+; P8LE-NEXT: mulli r12, r8, 95
+; P8LE-NEXT: add r10, r11, r10
+; P8LE-NEXT: srwi r8, r4, 31
; P8LE-NEXT: mtfprd f1, r9
-; P8LE-NEXT: mulli r9, r10, 95
-; P8LE-NEXT: add r5, r5, r8
+; P8LE-NEXT: srawi r4, r4, 6
+; P8LE-NEXT: mulli r11, r9, 95
; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: mtfprd f2, r10
-; P8LE-NEXT: mtfprd f3, r5
-; P8LE-NEXT: mulli r5, r5, 95
+; P8LE-NEXT: mulli r9, r10, 95
+; P8LE-NEXT: add r4, r4, r8
; P8LE-NEXT: xxswapd v3, vs1
-; P8LE-NEXT: subf r3, r0, r3
+; P8LE-NEXT: mtfprd f3, r4
+; P8LE-NEXT: mulli r4, r4, 95
; P8LE-NEXT: xxswapd v1, vs2
+; P8LE-NEXT: subf r3, r12, r3
; P8LE-NEXT: mtfprd f0, r3
-; P8LE-NEXT: subf r4, r11, r4
+; P8LE-NEXT: subf r6, r11, r6
; P8LE-NEXT: xxswapd v6, vs3
; P8LE-NEXT: subf r3, r9, r7
-; P8LE-NEXT: mtfprd f1, r4
+; P8LE-NEXT: mtfprd f1, r6
; P8LE-NEXT: mtfprd f4, r3
-; P8LE-NEXT: subf r3, r5, r6
+; P8LE-NEXT: subf r3, r4, r5
; P8LE-NEXT: mtfprd f5, r3
; P8LE-NEXT: xxswapd v4, vs1
; P8LE-NEXT: vmrglh v2, v3, v2
@@ -782,69 +692,61 @@ define <4 x i16> @combine_srem_sdiv(<4 x i16> %x) {
;
; P8BE-LABEL: combine_srem_sdiv:
; P8BE: # %bb.0:
-; P8BE-NEXT: mfvsrd r6, v2
-; P8BE-NEXT: lis r5, -21386
-; P8BE-NEXT: ori r5, r5, 37253
-; P8BE-NEXT: clrldi r3, r6, 48
-; P8BE-NEXT: rldicl r4, r6, 48, 48
+; P8BE-NEXT: mfvsrd r5, v2
+; P8BE-NEXT: lis r4, -21386
+; P8BE-NEXT: ori r4, r4, 37253
+; P8BE-NEXT: clrldi r3, r5, 48
+; P8BE-NEXT: rldicl r6, r5, 48, 48
; P8BE-NEXT: extsh r8, r3
-; P8BE-NEXT: rldicl r7, r6, 32, 48
-; P8BE-NEXT: extsh r9, r4
-; P8BE-NEXT: rldicl r6, r6, 16, 48
-; P8BE-NEXT: extsw r8, r8
+; P8BE-NEXT: rldicl r7, r5, 32, 48
+; P8BE-NEXT: extsh r9, r6
+; P8BE-NEXT: rldicl r5, r5, 16, 48
+; P8BE-NEXT: mulhw r11, r8, r4
; P8BE-NEXT: extsh r10, r7
-; P8BE-NEXT: extsw r9, r9
-; P8BE-NEXT: extsh r6, r6
-; P8BE-NEXT: mulld r11, r8, r5
-; P8BE-NEXT: extsw r10, r10
-; P8BE-NEXT: extsw r6, r6
-; P8BE-NEXT: mulld r12, r9, r5
-; P8BE-NEXT: mulld r0, r10, r5
-; P8BE-NEXT: mulld r5, r6, r5
-; P8BE-NEXT: rldicl r11, r11, 32, 32
-; P8BE-NEXT: rldicl r12, r12, 32, 32
+; P8BE-NEXT: extsh r5, r5
+; P8BE-NEXT: mulhw r12, r9, r4
+; P8BE-NEXT: mulhw r0, r10, r4
+; P8BE-NEXT: mulhw r4, r5, r4
; P8BE-NEXT: add r8, r11, r8
-; P8BE-NEXT: rldicl r0, r0, 32, 32
-; P8BE-NEXT: rldicl r5, r5, 32, 32
; P8BE-NEXT: add r9, r12, r9
; P8BE-NEXT: srawi r11, r8, 6
; P8BE-NEXT: srwi r8, r8, 31
; P8BE-NEXT: add r10, r0, r10
-; P8BE-NEXT: add r5, r5, r6
+; P8BE-NEXT: add r4, r4, r5
+; P8BE-NEXT: add r8, r11, r8
; P8BE-NEXT: srawi r12, r9, 6
; P8BE-NEXT: srwi r9, r9, 31
-; P8BE-NEXT: add r8, r11, r8
; P8BE-NEXT: srawi r0, r10, 6
-; P8BE-NEXT: srawi r11, r5, 6
+; P8BE-NEXT: srawi r11, r4, 6
; P8BE-NEXT: srwi r10, r10, 31
; P8BE-NEXT: add r9, r12, r9
-; P8BE-NEXT: srwi r5, r5, 31
+; P8BE-NEXT: srwi r4, r4, 31
; P8BE-NEXT: mulli r12, r8, 95
; P8BE-NEXT: add r10, r0, r10
-; P8BE-NEXT: add r5, r11, r5
+; P8BE-NEXT: add r4, r11, r4
; P8BE-NEXT: mulli r0, r9, 95
; P8BE-NEXT: sldi r9, r9, 48
; P8BE-NEXT: sldi r8, r8, 48
; P8BE-NEXT: mtvsrd v3, r9
-; P8BE-NEXT: mulli r9, r5, 95
+; P8BE-NEXT: mulli r9, r4, 95
; P8BE-NEXT: mtvsrd v2, r8
; P8BE-NEXT: mulli r8, r10, 95
; P8BE-NEXT: sldi r10, r10, 48
; P8BE-NEXT: subf r3, r12, r3
; P8BE-NEXT: mtvsrd v4, r10
-; P8BE-NEXT: subf r4, r0, r4
+; P8BE-NEXT: subf r6, r0, r6
; P8BE-NEXT: sldi r3, r3, 48
; P8BE-NEXT: vmrghh v2, v3, v2
-; P8BE-NEXT: sldi r4, r4, 48
+; P8BE-NEXT: sldi r6, r6, 48
; P8BE-NEXT: mtvsrd v3, r3
-; P8BE-NEXT: subf r3, r9, r6
+; P8BE-NEXT: subf r3, r9, r5
; P8BE-NEXT: subf r7, r8, r7
-; P8BE-NEXT: mtvsrd v5, r4
+; P8BE-NEXT: mtvsrd v5, r6
; P8BE-NEXT: sldi r3, r3, 48
-; P8BE-NEXT: sldi r6, r7, 48
+; P8BE-NEXT: sldi r5, r7, 48
; P8BE-NEXT: mtvsrd v1, r3
-; P8BE-NEXT: sldi r3, r5, 48
-; P8BE-NEXT: mtvsrd v0, r6
+; P8BE-NEXT: sldi r3, r4, 48
+; P8BE-NEXT: mtvsrd v0, r5
; P8BE-NEXT: vmrghh v3, v5, v3
; P8BE-NEXT: mtvsrd v5, r3
; P8BE-NEXT: vmrghh v0, v1, v0
@@ -882,14 +784,11 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -21386
; P9LE-NEXT: ori r5, r5, 37253
; P9LE-NEXT: xxswapd v4, vs0
-; P9LE-NEXT: vmrglh v3, v4, v3
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 6
@@ -904,6 +803,7 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: addze r4, r4
; P9LE-NEXT: slwi r4, r4, 3
; P9LE-NEXT: subf r3, r4, r3
+; P9LE-NEXT: vmrglh v3, v4, v3
; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: xxswapd v2, vs0
@@ -935,10 +835,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 37253
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 6
@@ -971,30 +869,28 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P8LE-NEXT: clrldi r7, r4, 48
; P8LE-NEXT: extsh r6, r5
; P8LE-NEXT: extsh r8, r7
-; P8LE-NEXT: extsw r6, r6
+; P8LE-NEXT: mulhw r3, r6, r3
; P8LE-NEXT: rldicl r9, r4, 48, 48
-; P8LE-NEXT: mulld r3, r6, r3
; P8LE-NEXT: srawi r8, r8, 6
; P8LE-NEXT: extsh r10, r9
; P8LE-NEXT: addze r8, r8
; P8LE-NEXT: rldicl r4, r4, 32, 48
; P8LE-NEXT: srawi r10, r10, 5
; P8LE-NEXT: slwi r8, r8, 6
-; P8LE-NEXT: subf r7, r8, r7
-; P8LE-NEXT: rldicl r3, r3, 32, 32
-; P8LE-NEXT: mtfprd f0, r7
; P8LE-NEXT: add r3, r3, r6
; P8LE-NEXT: addze r6, r10
+; P8LE-NEXT: subf r7, r8, r7
; P8LE-NEXT: srwi r10, r3, 31
; P8LE-NEXT: srawi r3, r3, 6
+; P8LE-NEXT: mtfprd f0, r7
; P8LE-NEXT: slwi r6, r6, 5
-; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: add r3, r3, r10
; P8LE-NEXT: extsh r10, r4
; P8LE-NEXT: subf r6, r6, r9
; P8LE-NEXT: mulli r3, r3, 95
; P8LE-NEXT: srawi r8, r10, 3
; P8LE-NEXT: mtfprd f1, r6
+; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: addze r7, r8
; P8LE-NEXT: xxswapd v3, vs1
; P8LE-NEXT: subf r3, r3, r5
@@ -1018,9 +914,8 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P8BE-NEXT: rldicl r6, r4, 32, 48
; P8BE-NEXT: extsh r5, r5
; P8BE-NEXT: extsh r6, r6
-; P8BE-NEXT: extsw r5, r5
+; P8BE-NEXT: mulhw r3, r5, r3
; P8BE-NEXT: rldicl r7, r4, 16, 48
-; P8BE-NEXT: mulld r3, r5, r3
; P8BE-NEXT: srawi r8, r6, 5
; P8BE-NEXT: extsh r7, r7
; P8BE-NEXT: addze r8, r8
@@ -1028,16 +923,15 @@ define <4 x i16> @dont_fold_srem_power_of_two(<4 x i16> %x) {
; P8BE-NEXT: srawi r9, r7, 6
; P8BE-NEXT: extsh r4, r4
; P8BE-NEXT: slwi r8, r8, 5
+; P8BE-NEXT: add r3, r3, r5
; P8BE-NEXT: addze r9, r9
; P8BE-NEXT: subf r6, r8, r6
-; P8BE-NEXT: rldicl r3, r3, 32, 32
-; P8BE-NEXT: slwi r8, r9, 6
-; P8BE-NEXT: add r3, r3, r5
-; P8BE-NEXT: subf r7, r8, r7
; P8BE-NEXT: srwi r10, r3, 31
; P8BE-NEXT: srawi r3, r3, 6
+; P8BE-NEXT: slwi r8, r9, 6
; P8BE-NEXT: add r3, r3, r10
; P8BE-NEXT: srawi r9, r4, 3
+; P8BE-NEXT: subf r7, r8, r7
; P8BE-NEXT: mulli r3, r3, 95
; P8BE-NEXT: sldi r6, r6, 48
; P8BE-NEXT: addze r8, r9
@@ -1065,13 +959,10 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -14230
; P9LE-NEXT: ori r5, r5, 30865
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
-; P9LE-NEXT: xxlxor v4, v4, v4
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 9
@@ -1081,12 +972,11 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 4
+; P9LE-NEXT: ori r5, r5, 17097
+; P9LE-NEXT: xxlxor v3, v3, v3
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: ori r5, r5, 17097
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 4
@@ -1094,21 +984,19 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9LE-NEXT: lis r5, 24749
; P9LE-NEXT: mulli r4, r4, 23
; P9LE-NEXT: subf r3, r4, r3
-; P9LE-NEXT: xxswapd v3, vs0
+; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
; P9LE-NEXT: ori r5, r5, 47143
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r5, r4, 1, 63
-; P9LE-NEXT: rldicl r4, r4, 32, 32
+; P9LE-NEXT: mulhw r4, r4, r5
+; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 11
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 5423
; P9LE-NEXT: subf r3, r4, r3
-; P9LE-NEXT: vmrglh v3, v3, v4
+; P9LE-NEXT: vmrglh v3, v4, v3
; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: xxswapd v2, vs0
@@ -1120,12 +1008,10 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: ori r4, r4, 17097
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: extsh r3, r3
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 4
@@ -1138,11 +1024,9 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 47143
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r5, r4, 1, 63
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
+; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
; P9BE-NEXT: add r4, r4, r5
; P9BE-NEXT: mulli r4, r4, 5423
@@ -1153,10 +1037,8 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 30865
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 9
@@ -1177,46 +1059,40 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
; P8LE-NEXT: lis r3, 24749
-; P8LE-NEXT: lis r8, -19946
-; P8LE-NEXT: lis r10, -14230
+; P8LE-NEXT: lis r7, -19946
+; P8LE-NEXT: lis r9, -14230
; P8LE-NEXT: xxlxor v5, v5, v5
; P8LE-NEXT: ori r3, r3, 47143
-; P8LE-NEXT: ori r8, r8, 17097
+; P8LE-NEXT: ori r7, r7, 17097
; P8LE-NEXT: mffprd r4, f0
; P8LE-NEXT: rldicl r5, r4, 16, 48
; P8LE-NEXT: rldicl r6, r4, 32, 48
; P8LE-NEXT: rldicl r4, r4, 48, 48
-; P8LE-NEXT: extsh r7, r5
-; P8LE-NEXT: extsh r9, r6
-; P8LE-NEXT: extsw r7, r7
-; P8LE-NEXT: extsh r11, r4
-; P8LE-NEXT: extsw r9, r9
-; P8LE-NEXT: mulld r3, r7, r3
-; P8LE-NEXT: ori r7, r10, 30865
-; P8LE-NEXT: extsw r10, r11
-; P8LE-NEXT: mulld r8, r9, r8
-; P8LE-NEXT: mulld r7, r10, r7
-; P8LE-NEXT: rldicl r11, r3, 1, 63
-; P8LE-NEXT: rldicl r3, r3, 32, 32
-; P8LE-NEXT: rldicl r8, r8, 32, 32
-; P8LE-NEXT: rldicl r7, r7, 32, 32
-; P8LE-NEXT: add r8, r8, r9
-; P8LE-NEXT: srawi r3, r3, 11
+; P8LE-NEXT: extsh r8, r5
+; P8LE-NEXT: extsh r10, r6
+; P8LE-NEXT: mulhw r3, r8, r3
+; P8LE-NEXT: ori r8, r9, 30865
+; P8LE-NEXT: extsh r9, r4
+; P8LE-NEXT: mulhw r7, r10, r7
+; P8LE-NEXT: mulhw r8, r9, r8
; P8LE-NEXT: add r7, r7, r10
-; P8LE-NEXT: srwi r9, r8, 31
-; P8LE-NEXT: srawi r8, r8, 4
-; P8LE-NEXT: add r3, r3, r11
+; P8LE-NEXT: srwi r10, r3, 31
; P8LE-NEXT: add r8, r8, r9
+; P8LE-NEXT: srawi r3, r3, 11
; P8LE-NEXT: srwi r9, r7, 31
-; P8LE-NEXT: srawi r7, r7, 9
-; P8LE-NEXT: mulli r3, r3, 5423
+; P8LE-NEXT: srawi r7, r7, 4
+; P8LE-NEXT: add r3, r3, r10
; P8LE-NEXT: add r7, r7, r9
-; P8LE-NEXT: mulli r8, r8, 23
-; P8LE-NEXT: mulli r7, r7, 654
+; P8LE-NEXT: srwi r9, r8, 31
+; P8LE-NEXT: srawi r8, r8, 9
+; P8LE-NEXT: mulli r3, r3, 5423
+; P8LE-NEXT: add r8, r8, r9
+; P8LE-NEXT: mulli r7, r7, 23
+; P8LE-NEXT: mulli r8, r8, 654
; P8LE-NEXT: subf r3, r3, r5
; P8LE-NEXT: mtfprd f0, r3
-; P8LE-NEXT: subf r3, r8, r6
-; P8LE-NEXT: subf r4, r7, r4
+; P8LE-NEXT: subf r3, r7, r6
+; P8LE-NEXT: subf r4, r8, r4
; P8LE-NEXT: mtfprd f1, r3
; P8LE-NEXT: mtfprd f2, r4
; P8LE-NEXT: xxswapd v2, vs0
@@ -1229,54 +1105,48 @@ define <4 x i16> @dont_fold_srem_one(<4 x i16> %x) {
;
; P8BE-LABEL: dont_fold_srem_one:
; P8BE: # %bb.0:
-; P8BE-NEXT: mfvsrd r4, v2
-; P8BE-NEXT: lis r3, 24749
-; P8BE-NEXT: lis r7, -19946
+; P8BE-NEXT: mfvsrd r3, v2
+; P8BE-NEXT: lis r5, 24749
+; P8BE-NEXT: lis r6, -19946
; P8BE-NEXT: lis r8, -14230
-; P8BE-NEXT: ori r3, r3, 47143
-; P8BE-NEXT: ori r7, r7, 17097
+; P8BE-NEXT: ori r5, r5, 47143
+; P8BE-NEXT: ori r6, r6, 17097
; P8BE-NEXT: ori r8, r8, 30865
-; P8BE-NEXT: clrldi r5, r4, 48
-; P8BE-NEXT: rldicl r6, r4, 48, 48
-; P8BE-NEXT: rldicl r4, r4, 32, 48
-; P8BE-NEXT: extsh r5, r5
-; P8BE-NEXT: extsh r6, r6
+; P8BE-NEXT: clrldi r4, r3, 48
+; P8BE-NEXT: rldicl r7, r3, 48, 48
+; P8BE-NEXT: rldicl r3, r3, 32, 48
; P8BE-NEXT: extsh r4, r4
-; P8BE-NEXT: extsw r5, r5
-; P8BE-NEXT: extsw r6, r6
-; P8BE-NEXT: extsw r4, r4
-; P8BE-NEXT: mulld r3, r5, r3
-; P8BE-NEXT: mulld r7, r6, r7
-; P8BE-NEXT: mulld r8, r4, r8
-; P8BE-NEXT: rldicl r9, r3, 1, 63
-; P8BE-NEXT: rldicl r3, r3, 32, 32
-; P8BE-NEXT: rldicl r7, r7, 32, 32
-; P8BE-NEXT: rldicl r8, r8, 32, 32
-; P8BE-NEXT: srawi r3, r3, 11
-; P8BE-NEXT: add r7, r7, r6
-; P8BE-NEXT: add r8, r8, r4
-; P8BE-NEXT: add r3, r3, r9
-; P8BE-NEXT: srwi r9, r7, 31
-; P8BE-NEXT: srawi r7, r7, 4
-; P8BE-NEXT: mulli r3, r3, 5423
-; P8BE-NEXT: add r7, r7, r9
+; P8BE-NEXT: extsh r7, r7
+; P8BE-NEXT: extsh r3, r3
+; P8BE-NEXT: mulhw r5, r4, r5
+; P8BE-NEXT: mulhw r6, r7, r6
+; P8BE-NEXT: mulhw r8, r3, r8
+; P8BE-NEXT: srwi r9, r5, 31
+; P8BE-NEXT: srawi r5, r5, 11
+; P8BE-NEXT: add r6, r6, r7
+; P8BE-NEXT: add r8, r8, r3
+; P8BE-NEXT: add r5, r5, r9
+; P8BE-NEXT: srwi r9, r6, 31
+; P8BE-NEXT: srawi r6, r6, 4
+; P8BE-NEXT: add r6, r6, r9
; P8BE-NEXT: srwi r9, r8, 31
; P8BE-NEXT: srawi r8, r8, 9
-; P8BE-NEXT: mulli r7, r7, 23
+; P8BE-NEXT: mulli r5, r5, 5423
; P8BE-NEXT: add r8, r8, r9
+; P8BE-NEXT: mulli r6, r6, 23
; P8BE-NEXT: li r9, 0
; P8BE-NEXT: mulli r8, r8, 654
-; P8BE-NEXT: subf r3, r3, r5
+; P8BE-NEXT: subf r4, r5, r4
; P8BE-NEXT: sldi r5, r9, 48
-; P8BE-NEXT: sldi r3, r3, 48
; P8BE-NEXT: mtvsrd v2, r5
-; P8BE-NEXT: subf r5, r7, r6
-; P8BE-NEXT: mtvsrd v3, r3
-; P8BE-NEXT: sldi r3, r5, 48
-; P8BE-NEXT: subf r4, r8, r4
-; P8BE-NEXT: mtvsrd v4, r3
+; P8BE-NEXT: subf r5, r6, r7
; P8BE-NEXT: sldi r4, r4, 48
-; P8BE-NEXT: mtvsrd v5, r4
+; P8BE-NEXT: subf r3, r8, r3
+; P8BE-NEXT: mtvsrd v3, r4
+; P8BE-NEXT: sldi r4, r5, 48
+; P8BE-NEXT: sldi r3, r3, 48
+; P8BE-NEXT: mtvsrd v4, r4
+; P8BE-NEXT: mtvsrd v5, r3
; P8BE-NEXT: vmrghh v3, v4, v3
; P8BE-NEXT: vmrghh v2, v2, v5
; P8BE-NEXT: vmrghw v2, v2, v3
@@ -1291,12 +1161,10 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: extsh r4, r3
; P9LE-NEXT: lis r5, -19946
; P9LE-NEXT: ori r5, r5, 17097
-; P9LE-NEXT: extsw r4, r4
-; P9LE-NEXT: mulld r5, r4, r5
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: extsh r4, r3
+; P9LE-NEXT: mulhw r5, r4, r5
; P9LE-NEXT: add r4, r5, r4
; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 4
@@ -1308,11 +1176,9 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: extsh r4, r3
-; P9LE-NEXT: extsw r4, r4
; P9LE-NEXT: ori r5, r5, 47143
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r5, r4, 1, 63
-; P9LE-NEXT: rldicl r4, r4, 32, 32
+; P9LE-NEXT: mulhw r4, r4, r5
+; P9LE-NEXT: srwi r5, r4, 31
; P9LE-NEXT: srawi r4, r4, 11
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: mulli r4, r4, 5423
@@ -1339,12 +1205,10 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: extsh r3, r3
; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: ori r4, r4, 17097
-; P9BE-NEXT: extsw r3, r3
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: extsh r3, r3
+; P9BE-NEXT: mulhw r4, r3, r4
; P9BE-NEXT: add r4, r4, r3
; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 4
@@ -1357,11 +1221,9 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: extsh r3, r3
-; P9BE-NEXT: extsw r3, r3
; P9BE-NEXT: ori r4, r4, 47143
-; P9BE-NEXT: mulld r4, r3, r4
-; P9BE-NEXT: rldicl r5, r4, 1, 63
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhw r4, r3, r4
+; P9BE-NEXT: srwi r5, r4, 31
; P9BE-NEXT: srawi r4, r4, 11
; P9BE-NEXT: add r4, r4, r5
; P9BE-NEXT: mulli r4, r4, 5423
@@ -1388,39 +1250,35 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
; P8LE-LABEL: dont_fold_urem_i16_smax:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r6, 24749
-; P8LE-NEXT: lis r7, -19946
+; P8LE-NEXT: lis r4, 24749
+; P8LE-NEXT: lis r5, -19946
; P8LE-NEXT: xxlxor v5, v5, v5
-; P8LE-NEXT: ori r6, r6, 47143
-; P8LE-NEXT: ori r7, r7, 17097
+; P8LE-NEXT: ori r4, r4, 47143
+; P8LE-NEXT: ori r5, r5, 17097
; P8LE-NEXT: mffprd r3, f0
-; P8LE-NEXT: rldicl r4, r3, 16, 48
-; P8LE-NEXT: rldicl r5, r3, 32, 48
-; P8LE-NEXT: extsh r8, r4
-; P8LE-NEXT: extsh r9, r5
-; P8LE-NEXT: extsw r8, r8
-; P8LE-NEXT: extsw r9, r9
-; P8LE-NEXT: mulld r6, r8, r6
-; P8LE-NEXT: mulld r7, r9, r7
+; P8LE-NEXT: rldicl r6, r3, 16, 48
+; P8LE-NEXT: rldicl r7, r3, 32, 48
+; P8LE-NEXT: extsh r8, r6
+; P8LE-NEXT: extsh r9, r7
+; P8LE-NEXT: mulhw r4, r8, r4
+; P8LE-NEXT: mulhw r5, r9, r5
; P8LE-NEXT: rldicl r3, r3, 48, 48
-; P8LE-NEXT: rldicl r8, r6, 32, 32
-; P8LE-NEXT: rldicl r7, r7, 32, 32
-; P8LE-NEXT: rldicl r6, r6, 1, 63
-; P8LE-NEXT: srawi r8, r8, 11
-; P8LE-NEXT: add r7, r7, r9
-; P8LE-NEXT: add r6, r8, r6
-; P8LE-NEXT: srwi r8, r7, 31
-; P8LE-NEXT: srawi r7, r7, 4
-; P8LE-NEXT: mulli r6, r6, 5423
-; P8LE-NEXT: add r7, r7, r8
+; P8LE-NEXT: srwi r8, r4, 31
+; P8LE-NEXT: srawi r4, r4, 11
+; P8LE-NEXT: add r5, r5, r9
+; P8LE-NEXT: add r4, r4, r8
+; P8LE-NEXT: srwi r8, r5, 31
+; P8LE-NEXT: srawi r5, r5, 4
+; P8LE-NEXT: mulli r4, r4, 5423
+; P8LE-NEXT: add r5, r5, r8
; P8LE-NEXT: extsh r8, r3
-; P8LE-NEXT: mulli r7, r7, 23
+; P8LE-NEXT: mulli r5, r5, 23
; P8LE-NEXT: srawi r8, r8, 15
-; P8LE-NEXT: subf r4, r6, r4
+; P8LE-NEXT: subf r4, r4, r6
; P8LE-NEXT: addze r6, r8
; P8LE-NEXT: mtfprd f0, r4
; P8LE-NEXT: slwi r4, r6, 15
-; P8LE-NEXT: subf r5, r7, r5
+; P8LE-NEXT: subf r5, r5, r7
; P8LE-NEXT: subf r3, r4, r3
; P8LE-NEXT: mtfprd f1, r5
; P8LE-NEXT: xxswapd v2, vs0
@@ -1434,47 +1292,43 @@ define <4 x i16> @dont_fold_urem_i16_smax(<4 x i16> %x) {
;
; P8BE-LABEL: dont_fold_urem_i16_smax:
; P8BE: # %bb.0:
-; P8BE-NEXT: mfvsrd r4, v2
-; P8BE-NEXT: lis r3, 24749
-; P8BE-NEXT: lis r7, -19946
-; P8BE-NEXT: ori r3, r3, 47143
-; P8BE-NEXT: ori r7, r7, 17097
-; P8BE-NEXT: clrldi r5, r4, 48
-; P8BE-NEXT: rldicl r6, r4, 48, 48
-; P8BE-NEXT: extsh r5, r5
+; P8BE-NEXT: mfvsrd r3, v2
+; P8BE-NEXT: lis r4, 24749
+; P8BE-NEXT: lis r5, -19946
+; P8BE-NEXT: ori r4, r4, 47143
+; P8BE-NEXT: ori r5, r5, 17097
+; P8BE-NEXT: clrldi r6, r3, 48
+; P8BE-NEXT: rldicl r7, r3, 48, 48
; P8BE-NEXT: extsh r6, r6
-; P8BE-NEXT: extsw r5, r5
-; P8BE-NEXT: extsw r6, r6
-; P8BE-NEXT: mulld r3, r5, r3
-; P8BE-NEXT: mulld r7, r6, r7
-; P8BE-NEXT: rldicl r4, r4, 32, 48
-; P8BE-NEXT: extsh r4, r4
-; P8BE-NEXT: rldicl r8, r3, 1, 63
-; P8BE-NEXT: rldicl r3, r3, 32, 32
-; P8BE-NEXT: rldicl r7, r7, 32, 32
-; P8BE-NEXT: srawi r3, r3, 11
-; P8BE-NEXT: add r7, r7, r6
-; P8BE-NEXT: add r3, r3, r8
-; P8BE-NEXT: srwi r8, r7, 31
-; P8BE-NEXT: srawi r7, r7, 4
-; P8BE-NEXT: mulli r3, r3, 5423
-; P8BE-NEXT: add r7, r7, r8
+; P8BE-NEXT: extsh r7, r7
+; P8BE-NEXT: mulhw r4, r6, r4
+; P8BE-NEXT: mulhw r5, r7, r5
+; P8BE-NEXT: rldicl r3, r3, 32, 48
+; P8BE-NEXT: extsh r3, r3
+; P8BE-NEXT: srwi r8, r4, 31
+; P8BE-NEXT: srawi r4, r4, 11
+; P8BE-NEXT: add r5, r5, r7
+; P8BE-NEXT: add r4, r4, r8
+; P8BE-NEXT: srwi r8, r5, 31
+; P8BE-NEXT: srawi r5, r5, 4
+; P8BE-NEXT: mulli r4, r4, 5423
+; P8BE-NEXT: add r5, r5, r8
; P8BE-NEXT: li r8, 0
-; P8BE-NEXT: mulli r7, r7, 23
-; P8BE-NEXT: srawi r9, r4, 15
-; P8BE-NEXT: subf r3, r3, r5
-; P8BE-NEXT: sldi r5, r8, 48
+; P8BE-NEXT: mulli r5, r5, 23
+; P8BE-NEXT: srawi r9, r3, 15
+; P8BE-NEXT: subf r4, r4, r6
+; P8BE-NEXT: sldi r6, r8, 48
; P8BE-NEXT: addze r8, r9
-; P8BE-NEXT: mtvsrd v2, r5
-; P8BE-NEXT: subf r5, r7, r6
+; P8BE-NEXT: mtvsrd v2, r6
; P8BE-NEXT: slwi r6, r8, 15
-; P8BE-NEXT: sldi r3, r3, 48
-; P8BE-NEXT: subf r4, r6, r4
-; P8BE-NEXT: mtvsrd v3, r3
-; P8BE-NEXT: sldi r3, r5, 48
; P8BE-NEXT: sldi r4, r4, 48
-; P8BE-NEXT: mtvsrd v4, r3
-; P8BE-NEXT: mtvsrd v5, r4
+; P8BE-NEXT: subf r5, r5, r7
+; P8BE-NEXT: subf r3, r6, r3
+; P8BE-NEXT: mtvsrd v3, r4
+; P8BE-NEXT: sldi r4, r5, 48
+; P8BE-NEXT: sldi r3, r3, 48
+; P8BE-NEXT: mtvsrd v4, r4
+; P8BE-NEXT: mtvsrd v5, r3
; P8BE-NEXT: vmrghh v3, v4, v3
; P8BE-NEXT: vmrghh v2, v2, v5
; P8BE-NEXT: vmrghw v2, v2, v3
diff --git a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
index ba568c5d153b..ce8f179ff837 100644
--- a/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
+++ b/llvm/test/CodeGen/PowerPC/urem-vector-lkk.ll
@@ -15,21 +15,21 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: lis r5, 21399
; P9LE-NEXT: ori r5, r5, 33437
-; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT: mulld r4, r4, r5
+; P9LE-NEXT: clrlwi r4, r3, 16
+; P9LE-NEXT: mulhwu r4, r4, r5
; P9LE-NEXT: lis r5, 16727
; P9LE-NEXT: ori r5, r5, 2287
-; P9LE-NEXT: rldicl r4, r4, 27, 37
+; P9LE-NEXT: srwi r4, r4, 5
; P9LE-NEXT: mulli r4, r4, 98
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT: mulld r4, r4, r5
+; P9LE-NEXT: clrlwi r4, r3, 16
+; P9LE-NEXT: mulhwu r4, r4, r5
; P9LE-NEXT: lis r5, 8456
; P9LE-NEXT: ori r5, r5, 16913
-; P9LE-NEXT: rldicl r4, r4, 24, 40
+; P9LE-NEXT: srwi r4, r4, 8
; P9LE-NEXT: mulli r4, r4, 1003
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: xxswapd v3, vs0
@@ -37,8 +37,10 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: rlwinm r4, r3, 30, 18, 31
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r4, r4, 30, 34
+; P9LE-NEXT: mulhwu r4, r4, r5
+; P9LE-NEXT: lis r5, 22765
+; P9LE-NEXT: ori r5, r5, 8969
+; P9LE-NEXT: srwi r4, r4, 2
; P9LE-NEXT: mulli r4, r4, 124
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: xxswapd v4, vs0
@@ -46,19 +48,15 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: lis r6, 22765
-; P9LE-NEXT: ori r6, r6, 8969
-; P9LE-NEXT: vmrglh v3, v4, v3
-; P9LE-NEXT: xxswapd v4, vs0
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhwu r5, r4, r5
; P9LE-NEXT: subf r4, r5, r4
; P9LE-NEXT: srwi r4, r4, 1
; P9LE-NEXT: add r4, r4, r5
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: subf r3, r4, r3
+; P9LE-NEXT: vmrglh v3, v4, v3
+; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: xxswapd v2, vs0
; P9LE-NEXT: vmrglh v2, v4, v2
@@ -69,49 +67,45 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r4, 16727
+; P9BE-NEXT: ori r4, r4, 2287
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: lis r5, 16727
-; P9BE-NEXT: ori r5, r5, 2287
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: lis r5, 21399
-; P9BE-NEXT: ori r5, r5, 33437
-; P9BE-NEXT: rldicl r4, r4, 24, 40
+; P9BE-NEXT: mulhwu r4, r3, r4
+; P9BE-NEXT: srwi r4, r4, 8
; P9BE-NEXT: mulli r4, r4, 1003
; P9BE-NEXT: subf r3, r4, r3
+; P9BE-NEXT: lis r4, 21399
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: lis r5, 8456
-; P9BE-NEXT: ori r5, r5, 16913
-; P9BE-NEXT: rldicl r4, r4, 27, 37
+; P9BE-NEXT: ori r4, r4, 33437
+; P9BE-NEXT: mulhwu r4, r3, r4
+; P9BE-NEXT: srwi r4, r4, 5
; P9BE-NEXT: mulli r4, r4, 98
; P9BE-NEXT: subf r3, r4, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r5, 8456
+; P9BE-NEXT: ori r5, r5, 16913
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 30, 18, 31
-; P9BE-NEXT: mulld r3, r3, r5
-; P9BE-NEXT: lis r5, 22765
-; P9BE-NEXT: ori r5, r5, 8969
-; P9BE-NEXT: rldicl r3, r3, 30, 34
+; P9BE-NEXT: mulhwu r3, r3, r5
+; P9BE-NEXT: srwi r3, r3, 2
; P9BE-NEXT: mulli r3, r3, 124
; P9BE-NEXT: subf r3, r3, r4
+; P9BE-NEXT: lis r4, 22765
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: ori r4, r4, 8969
+; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: subf r5, r4, r3
; P9BE-NEXT: srwi r5, r5, 1
; P9BE-NEXT: add r4, r5, r4
@@ -128,45 +122,43 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
; P8LE-NEXT: lis r3, 22765
-; P8LE-NEXT: lis r8, 21399
+; P8LE-NEXT: lis r7, 21399
+; P8LE-NEXT: lis r10, 16727
; P8LE-NEXT: ori r3, r3, 8969
-; P8LE-NEXT: ori r8, r8, 33437
+; P8LE-NEXT: ori r7, r7, 33437
+; P8LE-NEXT: ori r10, r10, 2287
; P8LE-NEXT: mffprd r4, f0
-; P8LE-NEXT: clrldi r5, r4, 48
-; P8LE-NEXT: rldicl r9, r4, 32, 48
-; P8LE-NEXT: clrlwi r6, r5, 16
-; P8LE-NEXT: rldicl r10, r4, 16, 48
-; P8LE-NEXT: rlwinm r11, r9, 0, 16, 31
-; P8LE-NEXT: clrldi r7, r6, 32
-; P8LE-NEXT: rlwinm r12, r10, 0, 16, 31
-; P8LE-NEXT: mulld r3, r7, r3
-; P8LE-NEXT: lis r7, 16727
-; P8LE-NEXT: ori r7, r7, 2287
-; P8LE-NEXT: mulld r8, r11, r8
+; P8LE-NEXT: clrldi r6, r4, 48
+; P8LE-NEXT: rldicl r5, r4, 32, 48
+; P8LE-NEXT: clrlwi r9, r6, 16
+; P8LE-NEXT: rldicl r8, r4, 16, 48
+; P8LE-NEXT: clrlwi r11, r5, 16
+; P8LE-NEXT: mulhwu r3, r9, r3
+; P8LE-NEXT: clrlwi r12, r8, 16
+; P8LE-NEXT: mulhwu r7, r11, r7
; P8LE-NEXT: lis r11, 8456
; P8LE-NEXT: rldicl r4, r4, 48, 48
-; P8LE-NEXT: mulld r7, r12, r7
+; P8LE-NEXT: mulhwu r10, r12, r10
; P8LE-NEXT: ori r11, r11, 16913
; P8LE-NEXT: rlwinm r12, r4, 30, 18, 31
-; P8LE-NEXT: rldicl r3, r3, 32, 32
-; P8LE-NEXT: mulld r11, r12, r11
-; P8LE-NEXT: subf r6, r3, r6
-; P8LE-NEXT: rldicl r8, r8, 27, 37
-; P8LE-NEXT: srwi r6, r6, 1
-; P8LE-NEXT: add r3, r6, r3
-; P8LE-NEXT: rldicl r6, r7, 24, 40
-; P8LE-NEXT: mulli r7, r8, 98
+; P8LE-NEXT: mulhwu r11, r12, r11
+; P8LE-NEXT: subf r9, r3, r9
+; P8LE-NEXT: srwi r9, r9, 1
+; P8LE-NEXT: srwi r7, r7, 5
+; P8LE-NEXT: add r3, r9, r3
+; P8LE-NEXT: srwi r9, r10, 8
; P8LE-NEXT: srwi r3, r3, 6
-; P8LE-NEXT: rldicl r8, r11, 30, 34
-; P8LE-NEXT: mulli r6, r6, 1003
+; P8LE-NEXT: mulli r7, r7, 98
+; P8LE-NEXT: srwi r10, r11, 2
+; P8LE-NEXT: mulli r9, r9, 1003
; P8LE-NEXT: mulli r3, r3, 95
-; P8LE-NEXT: mulli r8, r8, 124
-; P8LE-NEXT: subf r7, r7, r9
-; P8LE-NEXT: subf r6, r6, r10
-; P8LE-NEXT: mtfprd f0, r7
-; P8LE-NEXT: subf r3, r3, r5
-; P8LE-NEXT: subf r4, r8, r4
-; P8LE-NEXT: mtfprd f1, r6
+; P8LE-NEXT: mulli r10, r10, 124
+; P8LE-NEXT: subf r5, r7, r5
+; P8LE-NEXT: subf r7, r9, r8
+; P8LE-NEXT: mtfprd f0, r5
+; P8LE-NEXT: subf r3, r3, r6
+; P8LE-NEXT: subf r4, r10, r4
+; P8LE-NEXT: mtfprd f1, r7
; P8LE-NEXT: mtfprd f2, r3
; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: mtfprd f3, r4
@@ -182,47 +174,43 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) {
; P8BE: # %bb.0:
; P8BE-NEXT: mfvsrd r4, v2
; P8BE-NEXT: lis r3, 22765
-; P8BE-NEXT: lis r9, 16727
+; P8BE-NEXT: lis r7, 16727
+; P8BE-NEXT: lis r9, 21399
+; P8BE-NEXT: lis r10, 8456
; P8BE-NEXT: ori r3, r3, 8969
-; P8BE-NEXT: ori r9, r9, 2287
-; P8BE-NEXT: rldicl r5, r4, 16, 48
-; P8BE-NEXT: clrldi r6, r4, 48
-; P8BE-NEXT: clrlwi r5, r5, 16
-; P8BE-NEXT: rldicl r7, r4, 48, 48
+; P8BE-NEXT: ori r7, r7, 2287
+; P8BE-NEXT: ori r9, r9, 33437
+; P8BE-NEXT: ori r10, r10, 16913
+; P8BE-NEXT: rldicl r6, r4, 16, 48
+; P8BE-NEXT: clrldi r5, r4, 48
; P8BE-NEXT: clrlwi r6, r6, 16
-; P8BE-NEXT: clrldi r8, r5, 32
-; P8BE-NEXT: clrlwi r7, r7, 16
-; P8BE-NEXT: mulld r3, r8, r3
-; P8BE-NEXT: lis r8, 21399
-; P8BE-NEXT: clrldi r10, r6, 32
-; P8BE-NEXT: ori r8, r8, 33437
-; P8BE-NEXT: clrldi r11, r7, 32
-; P8BE-NEXT: mulld r9, r10, r9
-; P8BE-NEXT: lis r10, 8456
+; P8BE-NEXT: rldicl r8, r4, 48, 48
+; P8BE-NEXT: clrlwi r5, r5, 16
+; P8BE-NEXT: mulhwu r3, r6, r3
; P8BE-NEXT: rldicl r4, r4, 32, 48
-; P8BE-NEXT: mulld r8, r11, r8
-; P8BE-NEXT: ori r10, r10, 16913
+; P8BE-NEXT: clrlwi r8, r8, 16
+; P8BE-NEXT: mulhwu r7, r5, r7
; P8BE-NEXT: rlwinm r11, r4, 30, 18, 31
-; P8BE-NEXT: rldicl r3, r3, 32, 32
; P8BE-NEXT: clrlwi r4, r4, 16
-; P8BE-NEXT: mulld r10, r11, r10
-; P8BE-NEXT: subf r11, r3, r5
+; P8BE-NEXT: mulhwu r9, r8, r9
+; P8BE-NEXT: mulhwu r10, r11, r10
+; P8BE-NEXT: subf r11, r3, r6
; P8BE-NEXT: srwi r11, r11, 1
-; P8BE-NEXT: rldicl r9, r9, 24, 40
+; P8BE-NEXT: srwi r7, r7, 8
; P8BE-NEXT: add r3, r11, r3
-; P8BE-NEXT: rldicl r8, r8, 27, 37
+; P8BE-NEXT: srwi r9, r9, 5
+; P8BE-NEXT: srwi r10, r10, 2
+; P8BE-NEXT: mulli r7, r7, 1003
; P8BE-NEXT: srwi r3, r3, 6
-; P8BE-NEXT: mulli r9, r9, 1003
-; P8BE-NEXT: rldicl r10, r10, 30, 34
-; P8BE-NEXT: mulli r8, r8, 98
+; P8BE-NEXT: mulli r9, r9, 98
; P8BE-NEXT: mulli r3, r3, 95
; P8BE-NEXT: mulli r10, r10, 124
-; P8BE-NEXT: subf r6, r9, r6
-; P8BE-NEXT: subf r7, r8, r7
-; P8BE-NEXT: sldi r6, r6, 48
-; P8BE-NEXT: subf r3, r3, r5
+; P8BE-NEXT: subf r5, r7, r5
+; P8BE-NEXT: subf r7, r9, r8
+; P8BE-NEXT: sldi r5, r5, 48
+; P8BE-NEXT: subf r3, r3, r6
; P8BE-NEXT: subf r4, r10, r4
-; P8BE-NEXT: mtvsrd v2, r6
+; P8BE-NEXT: mtvsrd v2, r5
; P8BE-NEXT: sldi r5, r7, 48
; P8BE-NEXT: sldi r3, r3, 48
; P8BE-NEXT: sldi r4, r4, 48
@@ -242,15 +230,13 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: lis r5, 22765
+; P9LE-NEXT: ori r5, r5, 8969
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: lis r6, 22765
-; P9LE-NEXT: ori r6, r6, 8969
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
-; P9LE-NEXT: subf r4, r5, r4
+; P9LE-NEXT: mulhwu r6, r4, r5
+; P9LE-NEXT: subf r4, r6, r4
; P9LE-NEXT: srwi r4, r4, 1
-; P9LE-NEXT: add r4, r4, r5
+; P9LE-NEXT: add r4, r4, r6
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: subf r3, r4, r3
@@ -258,12 +244,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
-; P9LE-NEXT: subf r4, r5, r4
+; P9LE-NEXT: mulhwu r6, r4, r5
+; P9LE-NEXT: subf r4, r6, r4
; P9LE-NEXT: srwi r4, r4, 1
-; P9LE-NEXT: add r4, r4, r5
+; P9LE-NEXT: add r4, r4, r6
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: subf r3, r4, r3
@@ -272,12 +256,10 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
-; P9LE-NEXT: subf r4, r5, r4
+; P9LE-NEXT: mulhwu r6, r4, r5
+; P9LE-NEXT: subf r4, r6, r4
; P9LE-NEXT: srwi r4, r4, 1
-; P9LE-NEXT: add r4, r4, r5
+; P9LE-NEXT: add r4, r4, r6
; P9LE-NEXT: srwi r4, r4, 6
; P9LE-NEXT: mulli r4, r4, 95
; P9LE-NEXT: subf r3, r4, r3
@@ -286,9 +268,7 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: mulhwu r5, r4, r5
; P9LE-NEXT: subf r4, r5, r4
; P9LE-NEXT: srwi r4, r4, 1
; P9LE-NEXT: add r4, r4, r5
@@ -307,55 +287,47 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r4, 22765
+; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: lis r5, 22765
-; P9BE-NEXT: ori r5, r5, 8969
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
-; P9BE-NEXT: subf r6, r4, r3
+; P9BE-NEXT: mulhwu r5, r3, r4
+; P9BE-NEXT: subf r6, r5, r3
; P9BE-NEXT: srwi r6, r6, 1
-; P9BE-NEXT: add r4, r6, r4
-; P9BE-NEXT: srwi r4, r4, 6
-; P9BE-NEXT: mulli r4, r4, 95
-; P9BE-NEXT: subf r3, r4, r3
+; P9BE-NEXT: add r5, r6, r5
+; P9BE-NEXT: srwi r5, r5, 6
+; P9BE-NEXT: mulli r5, r5, 95
+; P9BE-NEXT: subf r3, r5, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
-; P9BE-NEXT: subf r6, r4, r3
+; P9BE-NEXT: mulhwu r5, r3, r4
+; P9BE-NEXT: subf r6, r5, r3
; P9BE-NEXT: srwi r6, r6, 1
-; P9BE-NEXT: add r4, r6, r4
-; P9BE-NEXT: srwi r4, r4, 6
-; P9BE-NEXT: mulli r4, r4, 95
-; P9BE-NEXT: subf r3, r4, r3
+; P9BE-NEXT: add r5, r6, r5
+; P9BE-NEXT: srwi r5, r5, 6
+; P9BE-NEXT: mulli r5, r5, 95
+; P9BE-NEXT: subf r3, r5, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
-; P9BE-NEXT: subf r6, r4, r3
+; P9BE-NEXT: mulhwu r5, r3, r4
+; P9BE-NEXT: subf r6, r5, r3
; P9BE-NEXT: srwi r6, r6, 1
-; P9BE-NEXT: add r4, r6, r4
-; P9BE-NEXT: srwi r4, r4, 6
-; P9BE-NEXT: mulli r4, r4, 95
-; P9BE-NEXT: subf r3, r4, r3
+; P9BE-NEXT: add r5, r6, r5
+; P9BE-NEXT: srwi r5, r5, 6
+; P9BE-NEXT: mulli r5, r5, 95
+; P9BE-NEXT: subf r3, r5, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: subf r5, r4, r3
; P9BE-NEXT: srwi r5, r5, 1
; P9BE-NEXT: add r4, r5, r4
@@ -371,62 +343,52 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P8LE-LABEL: fold_urem_vec_2:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r4, 22765
-; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill
+; P8LE-NEXT: lis r3, 22765
; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT: ori r4, r4, 8969
-; P8LE-NEXT: mffprd r5, f0
-; P8LE-NEXT: clrldi r3, r5, 48
-; P8LE-NEXT: rldicl r6, r5, 48, 48
-; P8LE-NEXT: clrlwi r8, r3, 16
-; P8LE-NEXT: rldicl r7, r5, 32, 48
+; P8LE-NEXT: ori r3, r3, 8969
+; P8LE-NEXT: mffprd r4, f0
+; P8LE-NEXT: clrldi r5, r4, 48
+; P8LE-NEXT: rldicl r6, r4, 48, 48
+; P8LE-NEXT: clrlwi r8, r5, 16
+; P8LE-NEXT: rldicl r7, r4, 32, 48
; P8LE-NEXT: clrlwi r9, r6, 16
-; P8LE-NEXT: rldicl r5, r5, 16, 48
-; P8LE-NEXT: clrldi r11, r8, 32
-; P8LE-NEXT: clrlwi r10, r7, 16
-; P8LE-NEXT: clrlwi r12, r5, 16
-; P8LE-NEXT: mulld r11, r11, r4
-; P8LE-NEXT: clrldi r0, r9, 32
-; P8LE-NEXT: clrldi r30, r10, 32
-; P8LE-NEXT: clrldi r29, r12, 32
-; P8LE-NEXT: mulld r0, r0, r4
-; P8LE-NEXT: mulld r30, r30, r4
-; P8LE-NEXT: mulld r4, r29, r4
-; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
-; P8LE-NEXT: rldicl r11, r11, 32, 32
-; P8LE-NEXT: subf r8, r11, r8
-; P8LE-NEXT: rldicl r0, r0, 32, 32
+; P8LE-NEXT: rldicl r4, r4, 16, 48
+; P8LE-NEXT: mulhwu r10, r8, r3
+; P8LE-NEXT: clrlwi r11, r7, 16
+; P8LE-NEXT: clrlwi r0, r4, 16
+; P8LE-NEXT: mulhwu r12, r9, r3
+; P8LE-NEXT: mulhwu r30, r11, r3
+; P8LE-NEXT: mulhwu r3, r0, r3
+; P8LE-NEXT: subf r8, r10, r8
; P8LE-NEXT: srwi r8, r8, 1
-; P8LE-NEXT: rldicl r30, r30, 32, 32
-; P8LE-NEXT: rldicl r4, r4, 32, 32
-; P8LE-NEXT: subf r9, r0, r9
-; P8LE-NEXT: add r8, r8, r11
-; P8LE-NEXT: subf r10, r30, r10
-; P8LE-NEXT: subf r11, r4, r12
+; P8LE-NEXT: subf r9, r12, r9
+; P8LE-NEXT: add r8, r8, r10
+; P8LE-NEXT: subf r10, r30, r11
+; P8LE-NEXT: subf r11, r3, r0
; P8LE-NEXT: srwi r9, r9, 1
-; P8LE-NEXT: srwi r8, r8, 6
; P8LE-NEXT: srwi r10, r10, 1
; P8LE-NEXT: srwi r11, r11, 1
-; P8LE-NEXT: add r9, r9, r0
+; P8LE-NEXT: add r9, r9, r12
+; P8LE-NEXT: srwi r8, r8, 6
; P8LE-NEXT: add r10, r10, r30
-; P8LE-NEXT: add r4, r11, r4
+; P8LE-NEXT: add r3, r11, r3
; P8LE-NEXT: srwi r9, r9, 6
; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
; P8LE-NEXT: mulli r8, r8, 95
; P8LE-NEXT: srwi r10, r10, 6
-; P8LE-NEXT: srwi r4, r4, 6
+; P8LE-NEXT: srwi r3, r3, 6
; P8LE-NEXT: mulli r9, r9, 95
; P8LE-NEXT: mulli r10, r10, 95
-; P8LE-NEXT: mulli r4, r4, 95
-; P8LE-NEXT: subf r3, r8, r3
+; P8LE-NEXT: mulli r3, r3, 95
+; P8LE-NEXT: subf r5, r8, r5
; P8LE-NEXT: subf r6, r9, r6
-; P8LE-NEXT: mtfprd f0, r3
-; P8LE-NEXT: subf r3, r10, r7
-; P8LE-NEXT: subf r4, r4, r5
+; P8LE-NEXT: mtfprd f0, r5
+; P8LE-NEXT: subf r5, r10, r7
+; P8LE-NEXT: subf r3, r3, r4
; P8LE-NEXT: mtfprd f1, r6
-; P8LE-NEXT: mtfprd f2, r3
+; P8LE-NEXT: mtfprd f2, r5
; P8LE-NEXT: xxswapd v2, vs0
-; P8LE-NEXT: mtfprd f3, r4
+; P8LE-NEXT: mtfprd f3, r3
; P8LE-NEXT: xxswapd v3, vs1
; P8LE-NEXT: xxswapd v4, vs2
; P8LE-NEXT: xxswapd v5, vs3
@@ -445,24 +407,16 @@ define <4 x i16> @fold_urem_vec_2(<4 x i16> %x) {
; P8BE-NEXT: clrlwi r5, r5, 16
; P8BE-NEXT: rldicl r7, r4, 32, 48
; P8BE-NEXT: clrlwi r6, r6, 16
-; P8BE-NEXT: clrldi r8, r5, 32
+; P8BE-NEXT: mulhwu r8, r5, r3
; P8BE-NEXT: rldicl r4, r4, 16, 48
; P8BE-NEXT: clrlwi r7, r7, 16
-; P8BE-NEXT: clrldi r9, r6, 32
-; P8BE-NEXT: mulld r8, r8, r3
+; P8BE-NEXT: mulhwu r9, r6, r3
; P8BE-NEXT: clrlwi r4, r4, 16
-; P8BE-NEXT: clrldi r10, r7, 32
-; P8BE-NEXT: mulld r9, r9, r3
-; P8BE-NEXT: clrldi r11, r4, 32
-; P8BE-NEXT: mulld r10, r10, r3
-; P8BE-NEXT: mulld r3, r11, r3
-; P8BE-NEXT: rldicl r8, r8, 32, 32
-; P8BE-NEXT: rldicl r9, r9, 32, 32
+; P8BE-NEXT: mulhwu r10, r7, r3
+; P8BE-NEXT: mulhwu r3, r4, r3
; P8BE-NEXT: subf r11, r8, r5
-; P8BE-NEXT: rldicl r10, r10, 32, 32
; P8BE-NEXT: subf r12, r9, r6
; P8BE-NEXT: srwi r11, r11, 1
-; P8BE-NEXT: rldicl r3, r3, 32, 32
; P8BE-NEXT: add r8, r11, r8
; P8BE-NEXT: subf r11, r10, r7
; P8BE-NEXT: srwi r12, r12, 1
@@ -507,39 +461,33 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
+; P9LE-NEXT: lis r5, 22765
+; P9LE-NEXT: ori r5, r5, 8969
; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: lis r6, 22765
-; P9LE-NEXT: ori r6, r6, 8969
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
-; P9LE-NEXT: subf r4, r5, r4
+; P9LE-NEXT: mulhwu r6, r4, r5
+; P9LE-NEXT: subf r4, r6, r4
; P9LE-NEXT: srwi r4, r4, 1
-; P9LE-NEXT: add r4, r4, r5
+; P9LE-NEXT: add r4, r4, r6
; P9LE-NEXT: srwi r4, r4, 6
-; P9LE-NEXT: mulli r5, r4, 95
-; P9LE-NEXT: subf r3, r5, r3
+; P9LE-NEXT: mulli r6, r4, 95
+; P9LE-NEXT: subf r3, r6, r3
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: clrlwi r5, r3, 16
-; P9LE-NEXT: clrldi r7, r5, 32
-; P9LE-NEXT: mulld r7, r7, r6
-; P9LE-NEXT: rldicl r7, r7, 32, 32
-; P9LE-NEXT: subf r5, r7, r5
-; P9LE-NEXT: srwi r5, r5, 1
-; P9LE-NEXT: add r5, r5, r7
-; P9LE-NEXT: srwi r5, r5, 6
-; P9LE-NEXT: mulli r7, r5, 95
+; P9LE-NEXT: clrlwi r6, r3, 16
+; P9LE-NEXT: mulhwu r7, r6, r5
+; P9LE-NEXT: subf r6, r7, r6
+; P9LE-NEXT: srwi r6, r6, 1
+; P9LE-NEXT: add r6, r6, r7
+; P9LE-NEXT: srwi r6, r6, 6
+; P9LE-NEXT: mulli r7, r6, 95
; P9LE-NEXT: subf r3, r7, r3
; P9LE-NEXT: xxswapd v3, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r7, r3, 16
-; P9LE-NEXT: clrldi r8, r7, 32
-; P9LE-NEXT: mulld r8, r8, r6
-; P9LE-NEXT: rldicl r8, r8, 32, 32
+; P9LE-NEXT: mulhwu r8, r7, r5
; P9LE-NEXT: subf r7, r8, r7
; P9LE-NEXT: srwi r7, r7, 1
; P9LE-NEXT: add r7, r7, r8
@@ -551,14 +499,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r8, r3, 16
-; P9LE-NEXT: clrldi r9, r8, 32
-; P9LE-NEXT: mulld r6, r9, r6
-; P9LE-NEXT: rldicl r6, r6, 32, 32
-; P9LE-NEXT: subf r8, r6, r8
+; P9LE-NEXT: mulhwu r5, r8, r5
+; P9LE-NEXT: subf r8, r5, r8
; P9LE-NEXT: srwi r8, r8, 1
-; P9LE-NEXT: add r6, r8, r6
-; P9LE-NEXT: srwi r6, r6, 6
-; P9LE-NEXT: mulli r8, r6, 95
+; P9LE-NEXT: add r5, r8, r5
+; P9LE-NEXT: srwi r5, r5, 6
+; P9LE-NEXT: mulli r8, r5, 95
; P9LE-NEXT: subf r3, r8, r3
; P9LE-NEXT: vmrglh v3, v4, v3
; P9LE-NEXT: xxswapd v4, vs0
@@ -568,12 +514,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9LE-NEXT: vmrglh v2, v2, v4
; P9LE-NEXT: vmrglw v2, v2, v3
; P9LE-NEXT: xxswapd v3, vs0
-; P9LE-NEXT: mtfprd f0, r5
+; P9LE-NEXT: mtfprd f0, r6
; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r7
; P9LE-NEXT: vmrglh v3, v4, v3
; P9LE-NEXT: xxswapd v4, vs0
-; P9LE-NEXT: mtfprd f0, r6
+; P9LE-NEXT: mtfprd f0, r5
; P9LE-NEXT: xxswapd v5, vs0
; P9LE-NEXT: vmrglh v4, v5, v4
; P9LE-NEXT: vmrglw v3, v4, v3
@@ -584,40 +530,34 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r5, 22765
+; P9BE-NEXT: ori r5, r5, 8969
; P9BE-NEXT: clrlwi r4, r3, 16
-; P9BE-NEXT: lis r6, 22765
-; P9BE-NEXT: ori r6, r6, 8969
-; P9BE-NEXT: clrldi r5, r4, 32
-; P9BE-NEXT: mulld r5, r5, r6
-; P9BE-NEXT: rldicl r5, r5, 32, 32
-; P9BE-NEXT: subf r4, r5, r4
+; P9BE-NEXT: mulhwu r6, r4, r5
+; P9BE-NEXT: subf r4, r6, r4
; P9BE-NEXT: srwi r4, r4, 1
-; P9BE-NEXT: add r4, r4, r5
+; P9BE-NEXT: add r4, r4, r6
; P9BE-NEXT: srwi r4, r4, 6
-; P9BE-NEXT: mulli r5, r4, 95
-; P9BE-NEXT: subf r3, r5, r3
+; P9BE-NEXT: mulli r6, r4, 95
+; P9BE-NEXT: subf r3, r6, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: clrlwi r5, r3, 16
-; P9BE-NEXT: clrldi r7, r5, 32
-; P9BE-NEXT: mulld r7, r7, r6
-; P9BE-NEXT: rldicl r7, r7, 32, 32
-; P9BE-NEXT: subf r5, r7, r5
-; P9BE-NEXT: srwi r5, r5, 1
-; P9BE-NEXT: add r5, r5, r7
-; P9BE-NEXT: srwi r5, r5, 6
-; P9BE-NEXT: mulli r7, r5, 95
+; P9BE-NEXT: clrlwi r6, r3, 16
+; P9BE-NEXT: mulhwu r7, r6, r5
+; P9BE-NEXT: subf r6, r7, r6
+; P9BE-NEXT: srwi r6, r6, 1
+; P9BE-NEXT: add r6, r6, r7
+; P9BE-NEXT: srwi r6, r6, 6
+; P9BE-NEXT: mulli r7, r6, 95
; P9BE-NEXT: subf r3, r7, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r7, r3, 16
-; P9BE-NEXT: clrldi r8, r7, 32
-; P9BE-NEXT: mulld r8, r8, r6
-; P9BE-NEXT: rldicl r8, r8, 32, 32
+; P9BE-NEXT: mulhwu r8, r7, r5
; P9BE-NEXT: subf r7, r8, r7
; P9BE-NEXT: srwi r7, r7, 1
; P9BE-NEXT: add r7, r7, r8
@@ -630,14 +570,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r8, r3, 32
-; P9BE-NEXT: mulld r6, r8, r6
-; P9BE-NEXT: rldicl r6, r6, 32, 32
-; P9BE-NEXT: subf r8, r6, r3
+; P9BE-NEXT: mulhwu r5, r3, r5
+; P9BE-NEXT: subf r8, r5, r3
; P9BE-NEXT: srwi r8, r8, 1
-; P9BE-NEXT: add r6, r8, r6
-; P9BE-NEXT: srwi r6, r6, 6
-; P9BE-NEXT: mulli r8, r6, 95
+; P9BE-NEXT: add r5, r8, r5
+; P9BE-NEXT: srwi r5, r5, 6
+; P9BE-NEXT: mulli r8, r5, 95
; P9BE-NEXT: subf r3, r8, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v2, r3
@@ -645,12 +583,12 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P9BE-NEXT: vmrghh v2, v2, v4
; P9BE-NEXT: vmrghw v2, v2, v3
; P9BE-NEXT: mtvsrd v3, r3
-; P9BE-NEXT: sldi r3, r5, 48
+; P9BE-NEXT: sldi r3, r6, 48
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: sldi r3, r7, 48
; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
-; P9BE-NEXT: sldi r3, r6, 48
+; P9BE-NEXT: sldi r3, r5, 48
; P9BE-NEXT: mtvsrd v5, r3
; P9BE-NEXT: vmrghh v4, v5, v4
; P9BE-NEXT: vmrghw v3, v4, v3
@@ -660,68 +598,58 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
; P8LE-LABEL: combine_urem_udiv:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: lis r5, 22765
+; P8LE-NEXT: lis r4, 22765
; P8LE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8LE-NEXT: std r29, -24(r1) # 8-byte Folded Spill
-; P8LE-NEXT: ori r5, r5, 8969
-; P8LE-NEXT: mffprd r6, f0
-; P8LE-NEXT: clrldi r3, r6, 48
-; P8LE-NEXT: rldicl r4, r6, 48, 48
-; P8LE-NEXT: rldicl r7, r6, 32, 48
+; P8LE-NEXT: ori r4, r4, 8969
+; P8LE-NEXT: mffprd r5, f0
+; P8LE-NEXT: clrldi r3, r5, 48
+; P8LE-NEXT: rldicl r6, r5, 48, 48
; P8LE-NEXT: clrlwi r8, r3, 16
-; P8LE-NEXT: clrlwi r9, r4, 16
-; P8LE-NEXT: rldicl r6, r6, 16, 48
-; P8LE-NEXT: clrlwi r10, r7, 16
-; P8LE-NEXT: clrldi r11, r8, 32
-; P8LE-NEXT: clrlwi r12, r6, 16
-; P8LE-NEXT: clrldi r0, r9, 32
-; P8LE-NEXT: clrldi r30, r10, 32
-; P8LE-NEXT: mulld r11, r11, r5
-; P8LE-NEXT: clrldi r29, r12, 32
-; P8LE-NEXT: mulld r0, r0, r5
-; P8LE-NEXT: mulld r30, r30, r5
-; P8LE-NEXT: mulld r5, r29, r5
-; P8LE-NEXT: ld r29, -24(r1) # 8-byte Folded Reload
-; P8LE-NEXT: rldicl r11, r11, 32, 32
-; P8LE-NEXT: rldicl r0, r0, 32, 32
-; P8LE-NEXT: rldicl r30, r30, 32, 32
-; P8LE-NEXT: subf r8, r11, r8
-; P8LE-NEXT: rldicl r5, r5, 32, 32
-; P8LE-NEXT: subf r9, r0, r9
+; P8LE-NEXT: rldicl r7, r5, 32, 48
+; P8LE-NEXT: clrlwi r9, r6, 16
+; P8LE-NEXT: mulhwu r10, r8, r4
+; P8LE-NEXT: clrlwi r11, r7, 16
+; P8LE-NEXT: rldicl r5, r5, 16, 48
+; P8LE-NEXT: mulhwu r12, r9, r4
+; P8LE-NEXT: mulhwu r0, r11, r4
+; P8LE-NEXT: clrlwi r30, r5, 16
+; P8LE-NEXT: mulhwu r4, r30, r4
+; P8LE-NEXT: subf r8, r10, r8
; P8LE-NEXT: srwi r8, r8, 1
-; P8LE-NEXT: subf r10, r30, r10
-; P8LE-NEXT: add r8, r8, r11
+; P8LE-NEXT: subf r9, r12, r9
+; P8LE-NEXT: add r8, r8, r10
+; P8LE-NEXT: subf r10, r0, r11
; P8LE-NEXT: srwi r9, r9, 1
; P8LE-NEXT: srwi r10, r10, 1
-; P8LE-NEXT: subf r11, r5, r12
-; P8LE-NEXT: add r9, r9, r0
+; P8LE-NEXT: subf r11, r4, r30
+; P8LE-NEXT: add r9, r9, r12
; P8LE-NEXT: srwi r8, r8, 6
-; P8LE-NEXT: add r10, r10, r30
-; P8LE-NEXT: srwi r11, r11, 1
; P8LE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
+; P8LE-NEXT: add r10, r10, r0
+; P8LE-NEXT: srwi r11, r11, 1
; P8LE-NEXT: srwi r9, r9, 6
+; P8LE-NEXT: mtfprd f0, r8
; P8LE-NEXT: mulli r12, r8, 95
; P8LE-NEXT: srwi r10, r10, 6
-; P8LE-NEXT: add r5, r11, r5
-; P8LE-NEXT: mtfprd f0, r8
-; P8LE-NEXT: mulli r8, r9, 95
+; P8LE-NEXT: add r4, r11, r4
; P8LE-NEXT: mtfprd f1, r9
+; P8LE-NEXT: mulli r8, r9, 95
; P8LE-NEXT: mulli r9, r10, 95
-; P8LE-NEXT: srwi r5, r5, 6
-; P8LE-NEXT: mtfprd f3, r5
-; P8LE-NEXT: mulli r5, r5, 95
+; P8LE-NEXT: srwi r4, r4, 6
; P8LE-NEXT: xxswapd v2, vs0
-; P8LE-NEXT: xxswapd v3, vs1
; P8LE-NEXT: mtfprd f2, r10
+; P8LE-NEXT: mtfprd f3, r4
+; P8LE-NEXT: mulli r4, r4, 95
+; P8LE-NEXT: xxswapd v3, vs1
+; P8LE-NEXT: xxswapd v1, vs2
; P8LE-NEXT: subf r3, r12, r3
; P8LE-NEXT: xxswapd v6, vs3
; P8LE-NEXT: mtfprd f0, r3
; P8LE-NEXT: subf r3, r9, r7
-; P8LE-NEXT: subf r4, r8, r4
-; P8LE-NEXT: xxswapd v1, vs2
+; P8LE-NEXT: subf r6, r8, r6
; P8LE-NEXT: mtfprd f4, r3
-; P8LE-NEXT: subf r3, r5, r6
-; P8LE-NEXT: mtfprd f1, r4
+; P8LE-NEXT: subf r3, r4, r5
+; P8LE-NEXT: mtfprd f1, r6
; P8LE-NEXT: mtfprd f5, r3
; P8LE-NEXT: xxswapd v5, vs4
; P8LE-NEXT: vmrglh v2, v3, v2
@@ -738,71 +666,61 @@ define <4 x i16> @combine_urem_udiv(<4 x i16> %x) {
;
; P8BE-LABEL: combine_urem_udiv:
; P8BE: # %bb.0:
-; P8BE-NEXT: mfvsrd r6, v2
-; P8BE-NEXT: lis r5, 22765
-; P8BE-NEXT: std r30, -16(r1) # 8-byte Folded Spill
-; P8BE-NEXT: ori r5, r5, 8969
-; P8BE-NEXT: clrldi r3, r6, 48
-; P8BE-NEXT: rldicl r4, r6, 48, 48
+; P8BE-NEXT: mfvsrd r5, v2
+; P8BE-NEXT: lis r4, 22765
+; P8BE-NEXT: ori r4, r4, 8969
+; P8BE-NEXT: clrldi r3, r5, 48
+; P8BE-NEXT: rldicl r6, r5, 48, 48
; P8BE-NEXT: clrlwi r8, r3, 16
-; P8BE-NEXT: rldicl r7, r6, 32, 48
-; P8BE-NEXT: clrlwi r9, r4, 16
-; P8BE-NEXT: rldicl r6, r6, 16, 48
-; P8BE-NEXT: clrldi r11, r8, 32
-; P8BE-NEXT: clrlwi r10, r7, 16
-; P8BE-NEXT: clrlwi r6, r6, 16
-; P8BE-NEXT: clrldi r12, r9, 32
-; P8BE-NEXT: mulld r11, r11, r5
-; P8BE-NEXT: clrldi r0, r10, 32
-; P8BE-NEXT: clrldi r30, r6, 32
-; P8BE-NEXT: mulld r12, r12, r5
-; P8BE-NEXT: mulld r0, r0, r5
-; P8BE-NEXT: mulld r5, r30, r5
-; P8BE-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
-; P8BE-NEXT: rldicl r11, r11, 32, 32
-; P8BE-NEXT: rldicl r12, r12, 32, 32
-; P8BE-NEXT: subf r8, r11, r8
-; P8BE-NEXT: rldicl r5, r5, 32, 32
+; P8BE-NEXT: rldicl r7, r5, 32, 48
+; P8BE-NEXT: clrlwi r9, r6, 16
+; P8BE-NEXT: rldicl r5, r5, 16, 48
+; P8BE-NEXT: mulhwu r10, r8, r4
+; P8BE-NEXT: clrlwi r11, r7, 16
+; P8BE-NEXT: mulhwu r12, r9, r4
+; P8BE-NEXT: clrlwi r5, r5, 16
+; P8BE-NEXT: mulhwu r0, r11, r4
+; P8BE-NEXT: mulhwu r4, r5, r4
+; P8BE-NEXT: subf r8, r10, r8
; P8BE-NEXT: subf r9, r12, r9
; P8BE-NEXT: srwi r8, r8, 1
-; P8BE-NEXT: rldicl r0, r0, 32, 32
-; P8BE-NEXT: add r8, r8, r11
+; P8BE-NEXT: add r8, r8, r10
+; P8BE-NEXT: subf r10, r0, r11
; P8BE-NEXT: srwi r9, r9, 1
-; P8BE-NEXT: subf r11, r5, r6
-; P8BE-NEXT: subf r10, r0, r10
+; P8BE-NEXT: subf r11, r4, r5
; P8BE-NEXT: add r9, r9, r12
; P8BE-NEXT: srwi r8, r8, 6
; P8BE-NEXT: srwi r11, r11, 1
; P8BE-NEXT: srwi r10, r10, 1
; P8BE-NEXT: srwi r9, r9, 6
-; P8BE-NEXT: add r5, r11, r5
; P8BE-NEXT: mulli r12, r8, 95
+; P8BE-NEXT: add r4, r11, r4
; P8BE-NEXT: add r10, r10, r0
-; P8BE-NEXT: srwi r5, r5, 6
; P8BE-NEXT: mulli r11, r9, 95
-; P8BE-NEXT: sldi r9, r9, 48
+; P8BE-NEXT: srwi r4, r4, 6
; P8BE-NEXT: srwi r10, r10, 6
+; P8BE-NEXT: sldi r9, r9, 48
; P8BE-NEXT: sldi r8, r8, 48
; P8BE-NEXT: mtvsrd v3, r9
-; P8BE-NEXT: mulli r9, r5, 95
+; P8BE-NEXT: mulli r9, r4, 95
; P8BE-NEXT: mtvsrd v2, r8
; P8BE-NEXT: mulli r8, r10, 95
-; P8BE-NEXT: sldi r10, r10, 48
; P8BE-NEXT: subf r3, r12, r3
-; P8BE-NEXT: mtvsrd v4, r10
-; P8BE-NEXT: subf r4, r11, r4
+; P8BE-NEXT: subf r6, r11, r6
; P8BE-NEXT: sldi r3, r3, 48
; P8BE-NEXT: vmrghh v2, v3, v2
-; P8BE-NEXT: sldi r4, r4, 48
+; P8BE-NEXT: sldi r6, r6, 48
+; P8BE-NEXT: sldi r10, r10, 48
; P8BE-NEXT: mtvsrd v3, r3
-; P8BE-NEXT: subf r3, r9, r6
+; P8BE-NEXT: subf r3, r9, r5
; P8BE-NEXT: subf r7, r8, r7
-; P8BE-NEXT: mtvsrd v5, r4
+; P8BE-NEXT: mtvsrd v5, r6
; P8BE-NEXT: sldi r3, r3, 48
-; P8BE-NEXT: sldi r6, r7, 48
+; P8BE-NEXT: sldi r5, r7, 48
; P8BE-NEXT: mtvsrd v1, r3
-; P8BE-NEXT: sldi r3, r5, 48
-; P8BE-NEXT: mtvsrd v0, r6
+; P8BE-NEXT: sldi r3, r4, 48
+; P8BE-NEXT: mtvsrd v4, r10
+; P8BE-NEXT: mtvsrd v0, r5
; P8BE-NEXT: vmrghh v3, v5, v3
; P8BE-NEXT: mtvsrd v5, r3
; P8BE-NEXT: vmrghh v0, v1, v0
@@ -832,14 +750,11 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: clrlwi r4, r3, 16
-; P9LE-NEXT: lis r6, 22765
-; P9LE-NEXT: ori r6, r6, 8969
+; P9LE-NEXT: lis r5, 22765
+; P9LE-NEXT: ori r5, r5, 8969
; P9LE-NEXT: xxswapd v4, vs0
-; P9LE-NEXT: vmrglh v3, v4, v3
-; P9LE-NEXT: clrldi r5, r4, 32
-; P9LE-NEXT: mulld r5, r5, r6
-; P9LE-NEXT: rldicl r5, r5, 32, 32
+; P9LE-NEXT: clrlwi r4, r3, 16
+; P9LE-NEXT: mulhwu r5, r4, r5
; P9LE-NEXT: subf r4, r5, r4
; P9LE-NEXT: srwi r4, r4, 1
; P9LE-NEXT: add r4, r4, r5
@@ -850,6 +765,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9LE-NEXT: li r3, 4
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: clrlwi r3, r3, 29
+; P9LE-NEXT: vmrglh v3, v4, v3
; P9LE-NEXT: xxswapd v4, vs0
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: xxswapd v2, vs0
@@ -871,13 +787,11 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
-; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: lis r5, 22765
-; P9BE-NEXT: ori r5, r5, 8969
+; P9BE-NEXT: lis r4, 22765
+; P9BE-NEXT: ori r4, r4, 8969
; P9BE-NEXT: vmrghh v3, v4, v3
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: rldicl r4, r4, 32, 32
+; P9BE-NEXT: clrlwi r3, r3, 16
+; P9BE-NEXT: mulhwu r4, r3, r4
; P9BE-NEXT: subf r5, r4, r3
; P9BE-NEXT: srwi r5, r5, 1
; P9BE-NEXT: add r4, r5, r4
@@ -902,28 +816,26 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P8LE-NEXT: ori r3, r3, 8969
; P8LE-NEXT: mffprd r4, f0
; P8LE-NEXT: rldicl r5, r4, 16, 48
-; P8LE-NEXT: clrlwi r6, r5, 16
-; P8LE-NEXT: clrldi r7, r6, 32
-; P8LE-NEXT: mulld r3, r7, r3
; P8LE-NEXT: rldicl r7, r4, 48, 48
-; P8LE-NEXT: clrlwi r7, r7, 27
-; P8LE-NEXT: mtfprd f1, r7
-; P8LE-NEXT: rldicl r3, r3, 32, 32
-; P8LE-NEXT: xxswapd v3, vs1
+; P8LE-NEXT: clrlwi r6, r5, 16
+; P8LE-NEXT: mulhwu r3, r6, r3
; P8LE-NEXT: subf r6, r3, r6
; P8LE-NEXT: srwi r6, r6, 1
; P8LE-NEXT: add r3, r6, r3
; P8LE-NEXT: clrldi r6, r4, 48
; P8LE-NEXT: srwi r3, r3, 6
-; P8LE-NEXT: rldicl r4, r4, 32, 48
; P8LE-NEXT: clrlwi r6, r6, 26
; P8LE-NEXT: mulli r3, r3, 95
-; P8LE-NEXT: clrlwi r4, r4, 29
+; P8LE-NEXT: rldicl r4, r4, 32, 48
; P8LE-NEXT: mtfprd f0, r6
+; P8LE-NEXT: clrlwi r6, r7, 27
+; P8LE-NEXT: clrlwi r4, r4, 29
+; P8LE-NEXT: mtfprd f1, r6
; P8LE-NEXT: mtfprd f3, r4
; P8LE-NEXT: xxswapd v2, vs0
-; P8LE-NEXT: xxswapd v5, vs3
+; P8LE-NEXT: xxswapd v3, vs1
; P8LE-NEXT: subf r3, r3, r5
+; P8LE-NEXT: xxswapd v5, vs3
; P8LE-NEXT: mtfprd f2, r3
; P8LE-NEXT: vmrglh v2, v3, v2
; P8LE-NEXT: xxswapd v4, vs2
@@ -940,9 +852,7 @@ define <4 x i16> @dont_fold_urem_power_of_two(<4 x i16> %x) {
; P8BE-NEXT: rldicl r7, r4, 16, 48
; P8BE-NEXT: clrlwi r5, r5, 16
; P8BE-NEXT: clrlwi r7, r7, 26
-; P8BE-NEXT: clrldi r6, r5, 32
-; P8BE-NEXT: mulld r3, r6, r3
-; P8BE-NEXT: rldicl r3, r3, 32, 32
+; P8BE-NEXT: mulhwu r3, r5, r3
; P8BE-NEXT: subf r6, r3, r5
; P8BE-NEXT: srwi r6, r6, 1
; P8BE-NEXT: add r3, r6, r3
@@ -974,25 +884,24 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9LE-LABEL: dont_fold_urem_one:
; P9LE: # %bb.0:
; P9LE-NEXT: li r3, 4
-; P9LE-NEXT: li r5, 0
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: oris r6, r5, 45590
-; P9LE-NEXT: oris r5, r5, 51306
-; P9LE-NEXT: ori r6, r6, 17097
-; P9LE-NEXT: ori r5, r5, 30865
-; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT: mulld r4, r4, r6
-; P9LE-NEXT: lis r6, 24749
-; P9LE-NEXT: ori r6, r6, 47143
-; P9LE-NEXT: rldicl r4, r4, 28, 36
+; P9LE-NEXT: lis r5, -19946
+; P9LE-NEXT: ori r5, r5, 17097
+; P9LE-NEXT: clrlwi r4, r3, 16
+; P9LE-NEXT: mulhwu r4, r4, r5
+; P9LE-NEXT: lis r5, 24749
+; P9LE-NEXT: ori r5, r5, 47143
+; P9LE-NEXT: srwi r4, r4, 4
; P9LE-NEXT: mulli r4, r4, 23
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: mtfprd f0, r3
; P9LE-NEXT: li r3, 6
; P9LE-NEXT: vextuhrx r3, r3, v2
-; P9LE-NEXT: rlwinm r4, r3, 0, 16, 31
-; P9LE-NEXT: mulld r4, r4, r6
-; P9LE-NEXT: rldicl r4, r4, 21, 43
+; P9LE-NEXT: clrlwi r4, r3, 16
+; P9LE-NEXT: mulhwu r4, r4, r5
+; P9LE-NEXT: lis r5, -14230
+; P9LE-NEXT: ori r5, r5, 30865
+; P9LE-NEXT: srwi r4, r4, 11
; P9LE-NEXT: mulli r4, r4, 5423
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: xxswapd v3, vs0
@@ -1000,8 +909,8 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9LE-NEXT: li r3, 2
; P9LE-NEXT: vextuhrx r3, r3, v2
; P9LE-NEXT: rlwinm r4, r3, 31, 17, 31
-; P9LE-NEXT: mulld r4, r4, r5
-; P9LE-NEXT: rldicl r4, r4, 24, 40
+; P9LE-NEXT: mulhwu r4, r4, r5
+; P9LE-NEXT: srwi r4, r4, 8
; P9LE-NEXT: mulli r4, r4, 654
; P9LE-NEXT: subf r3, r4, r3
; P9LE-NEXT: xxswapd v4, vs0
@@ -1017,44 +926,41 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P9BE: # %bb.0:
; P9BE-NEXT: li r3, 6
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r4, 24749
+; P9BE-NEXT: ori r4, r4, 47143
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: lis r5, 24749
-; P9BE-NEXT: ori r5, r5, 47143
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r5
-; P9BE-NEXT: li r5, 0
-; P9BE-NEXT: oris r6, r5, 45590
-; P9BE-NEXT: oris r5, r5, 51306
-; P9BE-NEXT: ori r6, r6, 17097
-; P9BE-NEXT: ori r5, r5, 30865
-; P9BE-NEXT: rldicl r4, r4, 21, 43
+; P9BE-NEXT: mulhwu r4, r3, r4
+; P9BE-NEXT: srwi r4, r4, 11
; P9BE-NEXT: mulli r4, r4, 5423
; P9BE-NEXT: subf r3, r4, r3
+; P9BE-NEXT: lis r4, -19946
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v3, r3
; P9BE-NEXT: li r3, 4
; P9BE-NEXT: vextuhlx r3, r3, v2
; P9BE-NEXT: clrlwi r3, r3, 16
-; P9BE-NEXT: clrldi r4, r3, 32
-; P9BE-NEXT: mulld r4, r4, r6
-; P9BE-NEXT: rldicl r4, r4, 28, 36
+; P9BE-NEXT: ori r4, r4, 17097
+; P9BE-NEXT: mulhwu r4, r3, r4
+; P9BE-NEXT: srwi r4, r4, 4
; P9BE-NEXT: mulli r4, r4, 23
; P9BE-NEXT: subf r3, r4, r3
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: li r3, 2
; P9BE-NEXT: vextuhlx r3, r3, v2
+; P9BE-NEXT: lis r5, -14230
+; P9BE-NEXT: ori r5, r5, 30865
+; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: clrlwi r4, r3, 16
; P9BE-NEXT: rlwinm r3, r3, 31, 17, 31
-; P9BE-NEXT: mulld r3, r3, r5
-; P9BE-NEXT: rldicl r3, r3, 24, 40
+; P9BE-NEXT: mulhwu r3, r3, r5
+; P9BE-NEXT: srwi r3, r3, 8
; P9BE-NEXT: mulli r3, r3, 654
; P9BE-NEXT: subf r3, r3, r4
; P9BE-NEXT: sldi r3, r3, 48
; P9BE-NEXT: mtvsrd v2, r3
; P9BE-NEXT: li r3, 0
; P9BE-NEXT: sldi r3, r3, 48
-; P9BE-NEXT: vmrghh v3, v4, v3
; P9BE-NEXT: mtvsrd v4, r3
; P9BE-NEXT: vmrghh v2, v4, v2
; P9BE-NEXT: vmrghw v2, v2, v3
@@ -1063,35 +969,34 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P8LE-LABEL: dont_fold_urem_one:
; P8LE: # %bb.0:
; P8LE-NEXT: xxswapd vs0, v2
-; P8LE-NEXT: li r3, 0
-; P8LE-NEXT: lis r8, 24749
+; P8LE-NEXT: lis r3, -19946
+; P8LE-NEXT: lis r7, 24749
+; P8LE-NEXT: lis r9, -14230
; P8LE-NEXT: xxlxor v5, v5, v5
-; P8LE-NEXT: oris r5, r3, 45590
-; P8LE-NEXT: ori r8, r8, 47143
-; P8LE-NEXT: oris r3, r3, 51306
-; P8LE-NEXT: ori r5, r5, 17097
-; P8LE-NEXT: ori r3, r3, 30865
+; P8LE-NEXT: ori r3, r3, 17097
+; P8LE-NEXT: ori r7, r7, 47143
+; P8LE-NEXT: ori r9, r9, 30865
; P8LE-NEXT: mffprd r4, f0
-; P8LE-NEXT: rldicl r6, r4, 32, 48
-; P8LE-NEXT: rldicl r7, r4, 16, 48
-; P8LE-NEXT: rlwinm r9, r6, 0, 16, 31
+; P8LE-NEXT: rldicl r5, r4, 32, 48
+; P8LE-NEXT: rldicl r6, r4, 16, 48
+; P8LE-NEXT: clrlwi r8, r5, 16
; P8LE-NEXT: rldicl r4, r4, 48, 48
-; P8LE-NEXT: mulld r5, r9, r5
-; P8LE-NEXT: rlwinm r9, r7, 0, 16, 31
-; P8LE-NEXT: mulld r8, r9, r8
-; P8LE-NEXT: rlwinm r9, r4, 31, 17, 31
-; P8LE-NEXT: mulld r3, r9, r3
-; P8LE-NEXT: rldicl r5, r5, 28, 36
-; P8LE-NEXT: rldicl r8, r8, 21, 43
-; P8LE-NEXT: mulli r5, r5, 23
-; P8LE-NEXT: rldicl r3, r3, 24, 40
-; P8LE-NEXT: mulli r8, r8, 5423
-; P8LE-NEXT: mulli r3, r3, 654
-; P8LE-NEXT: subf r5, r5, r6
-; P8LE-NEXT: subf r6, r8, r7
-; P8LE-NEXT: mtfprd f0, r5
-; P8LE-NEXT: subf r3, r3, r4
-; P8LE-NEXT: mtfprd f1, r6
+; P8LE-NEXT: mulhwu r3, r8, r3
+; P8LE-NEXT: clrlwi r8, r6, 16
+; P8LE-NEXT: mulhwu r7, r8, r7
+; P8LE-NEXT: rlwinm r8, r4, 31, 17, 31
+; P8LE-NEXT: mulhwu r8, r8, r9
+; P8LE-NEXT: srwi r3, r3, 4
+; P8LE-NEXT: srwi r7, r7, 11
+; P8LE-NEXT: mulli r3, r3, 23
+; P8LE-NEXT: srwi r8, r8, 8
+; P8LE-NEXT: mulli r7, r7, 5423
+; P8LE-NEXT: mulli r8, r8, 654
+; P8LE-NEXT: subf r3, r3, r5
+; P8LE-NEXT: subf r5, r7, r6
+; P8LE-NEXT: mtfprd f0, r3
+; P8LE-NEXT: subf r3, r8, r4
+; P8LE-NEXT: mtfprd f1, r5
; P8LE-NEXT: mtfprd f2, r3
; P8LE-NEXT: xxswapd v2, vs0
; P8LE-NEXT: xxswapd v3, vs1
@@ -1104,45 +1009,42 @@ define <4 x i16> @dont_fold_urem_one(<4 x i16> %x) {
; P8BE-LABEL: dont_fold_urem_one:
; P8BE: # %bb.0:
; P8BE-NEXT: mfvsrd r4, v2
-; P8BE-NEXT: li r3, 0
-; P8BE-NEXT: lis r8, 24749
-; P8BE-NEXT: oris r6, r3, 51306
-; P8BE-NEXT: ori r8, r8, 47143
-; P8BE-NEXT: oris r3, r3, 45590
-; P8BE-NEXT: rldicl r5, r4, 32, 48
-; P8BE-NEXT: clrldi r7, r4, 48
-; P8BE-NEXT: ori r6, r6, 30865
-; P8BE-NEXT: ori r3, r3, 17097
-; P8BE-NEXT: rldicl r4, r4, 48, 48
-; P8BE-NEXT: rlwinm r9, r5, 31, 17, 31
-; P8BE-NEXT: clrlwi r7, r7, 16
+; P8BE-NEXT: lis r3, 24749
+; P8BE-NEXT: lis r7, -19946
+; P8BE-NEXT: lis r8, -14230
+; P8BE-NEXT: ori r3, r3, 47143
+; P8BE-NEXT: ori r7, r7, 17097
+; P8BE-NEXT: ori r8, r8, 30865
+; P8BE-NEXT: clrldi r5, r4, 48
+; P8BE-NEXT: rldicl r6, r4, 48, 48
+; P8BE-NEXT: rldicl r4, r4, 32, 48
; P8BE-NEXT: clrlwi r5, r5, 16
+; P8BE-NEXT: clrlwi r6, r6, 16
+; P8BE-NEXT: mulhwu r3, r5, r3
+; P8BE-NEXT: rlwinm r9, r4, 31, 17, 31
; P8BE-NEXT: clrlwi r4, r4, 16
-; P8BE-NEXT: mulld r6, r9, r6
-; P8BE-NEXT: clrldi r9, r7, 32
-; P8BE-NEXT: mulld r8, r9, r8
-; P8BE-NEXT: clrldi r9, r4, 32
-; P8BE-NEXT: mulld r3, r9, r3
+; P8BE-NEXT: mulhwu r7, r6, r7
+; P8BE-NEXT: mulhwu r8, r9, r8
; P8BE-NEXT: li r9, 0
-; P8BE-NEXT: rldicl r6, r6, 24, 40
-; P8BE-NEXT: mulli r6, r6, 654
-; P8BE-NEXT: rldicl r8, r8, 21, 43
-; P8BE-NEXT: rldicl r3, r3, 28, 36
-; P8BE-NEXT: mulli r8, r8, 5423
-; P8BE-NEXT: mulli r3, r3, 23
-; P8BE-NEXT: subf r5, r6, r5
-; P8BE-NEXT: sldi r6, r9, 48
-; P8BE-NEXT: mtvsrd v2, r6
-; P8BE-NEXT: sldi r5, r5, 48
-; P8BE-NEXT: subf r6, r8, r7
-; P8BE-NEXT: mtvsrd v3, r5
-; P8BE-NEXT: subf r3, r3, r4
-; P8BE-NEXT: sldi r4, r6, 48
+; P8BE-NEXT: srwi r3, r3, 11
+; P8BE-NEXT: srwi r7, r7, 4
+; P8BE-NEXT: mulli r3, r3, 5423
+; P8BE-NEXT: srwi r8, r8, 8
+; P8BE-NEXT: mulli r7, r7, 23
+; P8BE-NEXT: mulli r8, r8, 654
+; P8BE-NEXT: subf r3, r3, r5
+; P8BE-NEXT: sldi r5, r9, 48
+; P8BE-NEXT: mtvsrd v2, r5
+; P8BE-NEXT: subf r5, r7, r6
; P8BE-NEXT: sldi r3, r3, 48
-; P8BE-NEXT: mtvsrd v4, r4
+; P8BE-NEXT: subf r4, r8, r4
+; P8BE-NEXT: sldi r5, r5, 48
+; P8BE-NEXT: mtvsrd v3, r3
+; P8BE-NEXT: sldi r3, r4, 48
+; P8BE-NEXT: mtvsrd v4, r5
; P8BE-NEXT: mtvsrd v5, r3
-; P8BE-NEXT: vmrghh v2, v2, v3
-; P8BE-NEXT: vmrghh v3, v5, v4
+; P8BE-NEXT: vmrghh v3, v4, v3
+; P8BE-NEXT: vmrghh v2, v2, v5
; P8BE-NEXT: vmrghw v2, v2, v3
; P8BE-NEXT: blr
%1 = urem <4 x i16> %x, <i16 1, i16 654, i16 23, i16 5423>
More information about the llvm-commits
mailing list