[llvm] [SelectionDAG] Use Karatsuba decomposition to expand vector CLMUL via narrower legal types (PR #184468)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 07:32:56 PST 2026


https://github.com/AbdallahRashed updated https://github.com/llvm/llvm-project/pull/184468

>From ef87971fdd2450a138d5a07e7246c40ca65ceb89 Mon Sep 17 00:00:00 2001
From: AbdallahRashed <abdallah.mrashed at gmail.com>
Date: Sat, 28 Feb 2026 22:57:00 +0100
Subject: [PATCH] [SelectionDAG] Use halving decomposition to expand vector
 CLMUL via narrower legal types

Reuse the ExpandIntRes_CLMUL identity to expand vector
CLMUL/CLMULR/CLMULH on wider element types (vXi16, vXi32, vXi64) by
decomposing into half-element-width operations that eventually reach a
legal CLMUL type.

Three generic strategies in expandCLMUL:
1. Halve: halve element width (e.g. v8i16 -> v8i8 on AArch64)
2. Element widen: zext to wider type if CLMUL is legal there (e.g. x86)
3. Count widen: pad with undef to double element count (e.g. v4i16 -> v8i16)

A helper canNarrowCLMULToLegal() guides strategy selection and prevents
circular expansion in the CLMULH bitreverse path.

Also add Custom BITREVERSE lowering for v4i16/v8i16 on AArch64 using
REV16+RBIT, which the CLMULH expansion relies on.

Fixes #183768
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  157 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   16 +
 llvm/test/CodeGen/AArch64/clmul-fixed.ll      | 2533 ++++-----
 llvm/test/CodeGen/PowerPC/clmul-vector.ll     | 4556 ++++++++---------
 llvm/test/CodeGen/X86/clmul-vector.ll         |  155 +-
 5 files changed, 3316 insertions(+), 4101 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 088e6726fea58..a4bb227fed213 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8456,6 +8456,49 @@ SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
   return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal);
 }
 
+/// Return true if CLMUL on the fixed-length vector type VT can reach a type
+/// where CLMUL is Legal or Custom through a chain of element-width halvings
+/// and/or element-count doublings, visiting only legal types along the way.
+/// expandCLMUL uses this to choose halving/widening over bit-by-bit expansion.
+///
+/// HalveDepth counts halvings taken so far (each creates ~4x more operations);
+/// TotalDepth counts every step and bounds the recursion at 8. Widening steps
+/// are cheap pad/extract pairs, so only halvings are limited, to at most 2:
+///   1 halving: ~4 sub-CLMULs (good, e.g. v8i16 -> v8i8)
+///   2 halvings: ~16 sub-CLMULs (ok, e.g. v4i32 -> v4i16 -> v8i16 -> v8i8)
+///   3 halvings: ~64 sub-CLMULs (worse than bit-by-bit expansion)
+static bool canNarrowCLMULToLegal(const TargetLowering &TLI, LLVMContext &Ctx,
+                                  EVT VT, unsigned HalveDepth = 0,
+                                  unsigned TotalDepth = 0) {
+  if (HalveDepth > 2 || TotalDepth > 8 || !VT.isFixedLengthVector())
+    return false;
+  if (TLI.isOperationLegalOrCustom(ISD::CLMUL, VT))
+    return true;
+  if (!TLI.isTypeLegal(VT))
+    return false;
+
+  unsigned BW = VT.getScalarSizeInBits();
+
+  // Halve: try half the element width with the same element count.
+  // This is the expensive step -- each halving creates ~4x more operations.
+  if (BW >= 16) {
+    EVT HalfEltVT = EVT::getIntegerVT(Ctx, BW / 2);
+    EVT HalfVT = VT.changeVectorElementType(Ctx, HalfEltVT);
+    if (TLI.isTypeLegal(HalfVT) &&
+        canNarrowCLMULToLegal(TLI, Ctx, HalfVT, HalveDepth + 1, TotalDepth + 1))
+      return true;
+  }
+
+  // Widen: try double the element count (VT is known fixed-length here).
+  // This is cheap -- just INSERT_SUBVECTOR + EXTRACT_SUBVECTOR.
+  EVT WideVT = VT.getDoubleNumVectorElementsVT(Ctx);
+  if (TLI.isTypeLegal(WideVT) &&
+      canNarrowCLMULToLegal(TLI, Ctx, WideVT, HalveDepth, TotalDepth + 1))
+    return true;
+
+  return false;
+}
+
 SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
@@ -8463,15 +8506,103 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
   SDValue Y = Node->getOperand(1);
   unsigned BW = VT.getScalarSizeInBits();
   unsigned Opcode = Node->getOpcode();
+  LLVMContext &Ctx = *DAG.getContext();
 
   switch (Opcode) {
   case ISD::CLMUL: {
+    // For vector types, try decomposition strategies that leverage legal
+    // CLMUL on narrower or wider element types, avoiding the expensive
+    // bit-by-bit expansion.
+    if (VT.isVector()) {
+      // Strategy 1: Halving decomposition to half-element-width CLMUL.
+      // Applies ExpandIntRes_CLMUL's identity element-wise:
+      //   CLMUL(X, Y) = (Hi << HalfBW) | Lo
+      // where:
+      //   Lo = CLMUL(XLo, YLo)
+      //   Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+      unsigned HalfBW = BW / 2;
+      if (HalfBW >= 8) {
+        EVT HalfEltVT = EVT::getIntegerVT(Ctx, HalfBW);
+        EVT HalfVT =
+            EVT::getVectorVT(Ctx, HalfEltVT, VT.getVectorElementCount());
+        if (isTypeLegal(HalfVT) && canNarrowCLMULToLegal(*this, Ctx, HalfVT,
+                                                         /*HalveDepth=*/1)) {
+          SDValue ShAmt = DAG.getShiftAmountConstant(HalfBW, VT, DL);
+
+          // Extract low and high halves of each element.
+          SDValue XLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, X);
+          SDValue XHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                    DAG.getNode(ISD::SRL, DL, VT, X, ShAmt));
+          SDValue YLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Y);
+          SDValue YHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                    DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt));
+
+          // Lo = CLMUL(XLo, YLo)
+          SDValue Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YLo);
+
+          // Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+          SDValue LoH = DAG.getNode(ISD::CLMULH, DL, HalfVT, XLo, YLo);
+          SDValue Cross1 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YHi);
+          SDValue Cross2 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XHi, YLo);
+          SDValue Cross = DAG.getNode(ISD::XOR, DL, HalfVT, Cross1, Cross2);
+          SDValue Hi = DAG.getNode(ISD::XOR, DL, HalfVT, LoH, Cross);
+
+          // Reassemble: Result = ZExt(Lo) | (ZExt(Hi) << HalfBW)
+          SDValue LoExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo);
+          SDValue HiExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi);
+          SDValue HiShifted = DAG.getNode(ISD::SHL, DL, VT, HiExt, ShAmt);
+          return DAG.getNode(ISD::OR, DL, VT, LoExt, HiShifted);
+        }
+      }
+
+      // Strategy 2: Widen to double-element-width CLMUL.
+      // CLMUL(X, Y) = Trunc(CLMUL(ZExt(X), ZExt(Y)))
+      {
+        EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+        if (isTypeLegal(ExtVT) && isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
+            isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT)) {
+          // If CLMUL on ExtVT is Custom (not Legal), the target may
+          // scalarize it, costing O(NumElements) scalar ops. The bit-by-bit
+          // fallback costs O(BW) vectorized iterations. Only widen when
+          // element count is small enough that scalarization is cheaper.
+          unsigned NumElts = VT.getVectorMinNumElements();
+          if (isOperationLegal(ISD::CLMUL, ExtVT) || NumElts < BW) {
+            SDValue XExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, X);
+            SDValue YExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Y);
+            SDValue Mul = DAG.getNode(ISD::CLMUL, DL, ExtVT, XExt, YExt);
+            return DAG.getNode(ISD::TRUNCATE, DL, VT, Mul);
+          }
+        }
+      }
+
+      // Strategy 3: Widen element count (pad with undef, do CLMUL on wider
+      // vector, extract lower result). CLMUL is element-wise, so upper
+      // (undef) lanes don't affect the lower results.
+      // e.g. v4i16 => pad to v8i16 => halve to v8i8 PMUL => extract v4i16.
+      if (auto EC = VT.getVectorElementCount(); EC.isFixed()) {
+        EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), EC * 2);
+        if (isTypeLegal(WideVT) && canNarrowCLMULToLegal(*this, Ctx, WideVT)) {
+          SDValue Undef = DAG.getUNDEF(WideVT);
+          SDValue XWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+                                      X, DAG.getVectorIdxConstant(0, DL));
+          SDValue YWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+                                      Y, DAG.getVectorIdxConstant(0, DL));
+          SDValue WideRes = DAG.getNode(ISD::CLMUL, DL, WideVT, XWide, YWide);
+          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
+                             DAG.getVectorIdxConstant(0, DL));
+        }
+      }
+    }
+
+    // Scalarize if the vector multiplication is unlikely to work.
+    if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+      return DAG.UnrollVectorOp(Node);
+
     // NOTE: If you change this expansion, please update the cost model
     // calculation in BasicTTIImpl::getTypeBasedIntrinsicInstrCost for
     // Intrinsic::clmul.
 
-    EVT SetCCVT =
-        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), Ctx, VT);
 
     SDValue Res = DAG.getConstant(0, DL, VT);
     for (unsigned I = 0; I < BW; ++I) {
@@ -8484,8 +8615,7 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
       // instructions.
       SDValue Part;
       if (!hasBitTest(Y, ShiftAmt) &&
-          isOperationLegalOrCustom(
-              ISD::MUL, getTypeToTransformTo(*DAG.getContext(), VT))) {
+          isOperationLegalOrCustom(ISD::MUL, getTypeToTransformTo(Ctx, VT))) {
         Part = DAG.getNode(ISD::MUL, DL, VT, X, YMasked);
       } else {
         // Canonical bit test: (Y & (1 << I)) != 0
@@ -8512,17 +8642,20 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
     }
     [[fallthrough]];
   case ISD::CLMULH: {
-    EVT ExtVT = VT.changeElementType(
-        *DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), 2 * BW));
-    // For example, ExtVT = i64 based operations aren't legal on a 32-bit
-    // target; use bitreverse-based lowering in this case.
-    // Also prefer bitreverse-based lowering when CLMUL is legal on VT but
-    // not on ExtVT, to avoid expanding CLMUL on the wider type (e.g. v8i8
-    // on AArch64 where CLMUL v8i8 is legal via PMUL but CLMUL v8i16 is not).
+    EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+    // Use bitreverse-based lowering (CLMULR/H = rev(CLMUL(rev,rev)) >> S)
+    // when any of these hold:
+    // (a) ZERO_EXTEND to ExtVT or SRL on ExtVT isn't legal.
+    // (b) CLMUL is legal on VT but not on ExtVT (e.g. v8i8 on AArch64).
+    // (c) CLMUL on VT can be efficiently expanded via halving/widening
+    //     to reach legal CLMUL. The bitreverse path creates CLMUL(VT) which
+    //     will be expanded efficiently. The widening path would create
+    //     CLMUL(ExtVT) => halving => CLMULH(VT), causing a cycle.
     if (!isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT) ||
         !isOperationLegalOrCustom(ISD::SRL, ExtVT) ||
         (!isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
-         isOperationLegalOrCustom(ISD::CLMUL, VT))) {
+         isOperationLegalOrCustom(ISD::CLMUL, VT)) ||
+        canNarrowCLMULToLegal(*this, Ctx, VT)) {
       SDValue XRev = DAG.getNode(ISD::BITREVERSE, DL, VT, X);
       SDValue YRev = DAG.getNode(ISD::BITREVERSE, DL, VT, Y);
       SDValue ClMul = DAG.getNode(ISD::CLMUL, DL, VT, XRev, YRev);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2cd78493d2c23..b7d186d83c92e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1329,6 +1329,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::CTLS, VT, Legal);
     setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
     setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
+    setOperationAction(ISD::BITREVERSE, MVT::v4i16, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v8i16, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
@@ -11960,6 +11962,20 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
   default:
     llvm_unreachable("Invalid type for bitreverse!");
 
+  case MVT::v4i16: {
+    VST = MVT::v8i8;
+    REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
+  case MVT::v8i16: {
+    VST = MVT::v16i8;
+    REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
   case MVT::v2i32: {
     VST = MVT::v8i8;
     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 23692dc456fc2..46ad7d9bbc295 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -23,69 +23,23 @@ define <8 x i8> @clmul_v8i8_neon(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v3.8h, #1
-; CHECK-NEXT:    movi v4.8h, #4
-; CHECK-NEXT:    movi v5.8h, #8
-; CHECK-NEXT:    movi v6.8h, #16
-; CHECK-NEXT:    movi v7.8h, #32
-; CHECK-NEXT:    movi v16.8h, #128
-; CHECK-NEXT:    movi v17.8h, #1, lsl #8
-; CHECK-NEXT:    movi v18.8h, #8, lsl #8
-; CHECK-NEXT:    movi v19.8h, #16, lsl #8
-; CHECK-NEXT:    movi v20.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    mul v3.8h, v0.8h, v3.8h
-; CHECK-NEXT:    mul v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v0.8h, v5.8h
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    mul v6.8h, v0.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v0.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v17.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    mul v18.8h, v0.8h, v18.8h
-; CHECK-NEXT:    mul v19.8h, v0.8h, v19.8h
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v4.8h, v0.8h, v20.8h
-; CHECK-NEXT:    movi v20.8h, #128, lsl #8
-; CHECK-NEXT:    mul v5.8h, v0.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v0.8h, v22.8h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v20.16b
-; CHECK-NEXT:    mul v3.8h, v0.8h, v21.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v23.8h
-; CHECK-NEXT:    eor v4.16b, v6.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
   %a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
@@ -94,69 +48,26 @@ define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
 define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: clmul_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4h, #2
-; CHECK-NEXT:    movi v3.4h, #1
-; CHECK-NEXT:    movi v4.4h, #4
-; CHECK-NEXT:    movi v5.4h, #8
-; CHECK-NEXT:    movi v6.4h, #16
-; CHECK-NEXT:    movi v7.4h, #32
-; CHECK-NEXT:    movi v16.4h, #128
-; CHECK-NEXT:    movi v17.4h, #1, lsl #8
-; CHECK-NEXT:    movi v18.4h, #8, lsl #8
-; CHECK-NEXT:    movi v19.4h, #16, lsl #8
-; CHECK-NEXT:    movi v20.4h, #64
-; CHECK-NEXT:    movi v21.4h, #2, lsl #8
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    movi v22.4h, #32, lsl #8
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v2.4h, v0.4h, v2.4h
-; CHECK-NEXT:    mul v3.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mul v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    mul v5.4h, v0.4h, v5.4h
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v23.4h, #4, lsl #8
-; CHECK-NEXT:    movi v24.4h, #64, lsl #8
-; CHECK-NEXT:    mul v6.4h, v0.4h, v6.4h
-; CHECK-NEXT:    mul v7.4h, v0.4h, v7.4h
-; CHECK-NEXT:    mul v16.4h, v0.4h, v16.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v17.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    mul v18.4h, v0.4h, v18.4h
-; CHECK-NEXT:    mul v19.4h, v0.4h, v19.4h
-; CHECK-NEXT:    and v22.8b, v1.8b, v22.8b
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v4.4h, v0.4h, v20.4h
-; CHECK-NEXT:    movi v20.4h, #128, lsl #8
-; CHECK-NEXT:    mul v5.4h, v0.4h, v21.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v24.8b
-; CHECK-NEXT:    mul v22.4h, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    eor v7.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v16.8b, v18.8b, v19.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v20.8b
-; CHECK-NEXT:    mul v3.4h, v0.4h, v21.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v23.4h
-; CHECK-NEXT:    eor v4.8b, v6.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v7.8b, v5.8b
-; CHECK-NEXT:    eor v6.8b, v16.8b, v22.8b
-; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.8b, v2.8b, v4.8b
-; CHECK-NEXT:    eor v2.8b, v5.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
 ; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %a = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %x, <4 x i16> %y)
   ret <4 x i16> %a
@@ -165,269 +76,184 @@ define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
 define <4 x i32> @clmul_v4i32_neon(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: clmul_v4i32_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #64
-; CHECK-NEXT:    movi v17.4s, #128
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    movi v21.4s, #128, lsl #16
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #16, lsl #8
-; CHECK-NEXT:    mul v5.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v17.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v18.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #32, lsl #8
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    movi v20.4s, #64, lsl #8
-; CHECK-NEXT:    mul v21.4s, v0.4s, v21.4s
-; CHECK-NEXT:    and v3.16b, v1.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #1, lsl #16
-; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT:    movi v16.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v17.4s, #4, lsl #8
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v16.16b
-; CHECK-NEXT:    movi v16.4s, #64, lsl #16
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v19.4s
-; CHECK-NEXT:    movi v19.4s, #4, lsl #16
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v18.16b, v3.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v20.4s, #1, lsl #24
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #8
-; CHECK-NEXT:    mul v16.4s, v0.4s, v16.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v20.4s, v0.4s, v20.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v18.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v23.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v25.16b
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v24.16b
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v16.16b, v16.16b, v20.16b
-; CHECK-NEXT:    and v20.16b, v1.16b, v26.16b
-; CHECK-NEXT:    mul v19.4s, v0.4s, v19.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v0.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v0.4s, v20.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v22.16b
-; CHECK-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    xtn v2.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v0.4s
+; CHECK-NEXT:    shrn v16.4h, v0.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v2.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v1.8b, v3.8h
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v0.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v1.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEXT:    pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEXT:    pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %a
 }
 
 define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: clmul_v2i32_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2s, #2
-; CHECK-NEXT:    movi v3.2s, #1
-; CHECK-NEXT:    movi v4.2s, #4
-; CHECK-NEXT:    movi v5.2s, #8
-; CHECK-NEXT:    movi v6.2s, #16
-; CHECK-NEXT:    movi v7.2s, #32
-; CHECK-NEXT:    movi v16.2s, #64
-; CHECK-NEXT:    movi v17.2s, #128
-; CHECK-NEXT:    movi v18.2s, #1, lsl #8
-; CHECK-NEXT:    movi v19.2s, #2, lsl #8
-; CHECK-NEXT:    movi v20.2s, #8, lsl #8
-; CHECK-NEXT:    movi v21.2s, #128, lsl #16
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    mul v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    mul v6.2s, v0.2s, v6.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    movi v22.2s, #8, lsl #16
-; CHECK-NEXT:    movi v23.2s, #2, lsl #24
-; CHECK-NEXT:    movi v25.2s, #4, lsl #24
-; CHECK-NEXT:    movi v24.2s, #32, lsl #16
-; CHECK-NEXT:    movi v26.2s, #8, lsl #24
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #16, lsl #8
-; CHECK-NEXT:    mul v5.2s, v0.2s, v16.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v17.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v18.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v19.2s, #32, lsl #8
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    movi v20.2s, #64, lsl #8
-; CHECK-NEXT:    mul v21.2s, v0.2s, v21.2s
-; CHECK-NEXT:    and v3.8b, v1.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v6.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #1, lsl #16
-; CHECK-NEXT:    eor v6.8b, v16.8b, v17.8b
-; CHECK-NEXT:    movi v16.2s, #2, lsl #16
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v17.2s, #4, lsl #8
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v5.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v16.8b
-; CHECK-NEXT:    movi v16.2s, #64, lsl #16
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v19.2s
-; CHECK-NEXT:    movi v19.2s, #4, lsl #16
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v18.8b, v3.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v20.2s, #1, lsl #24
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v3.8b, v3.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v19.2s, #128, lsl #8
-; CHECK-NEXT:    mul v16.2s, v0.2s, v16.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #16
-; CHECK-NEXT:    mul v20.2s, v0.2s, v20.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v18.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v23.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v25.8b
-; CHECK-NEXT:    eor v16.8b, v16.8b, v21.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v24.8b
-; CHECK-NEXT:    movi v23.2s, #32, lsl #24
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #24
-; CHECK-NEXT:    movi v24.2s, #64, lsl #24
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v16.8b, v16.8b, v20.8b
-; CHECK-NEXT:    and v20.8b, v1.8b, v26.8b
-; CHECK-NEXT:    mul v19.2s, v0.2s, v19.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v6.8b
-; CHECK-NEXT:    mul v6.2s, v0.2s, v21.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    eor v5.8b, v16.8b, v18.8b
-; CHECK-NEXT:    movi v16.2s, #128, lsl #24
-; CHECK-NEXT:    mul v18.2s, v0.2s, v20.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v22.8b
-; CHECK-NEXT:    and v22.8b, v1.8b, v24.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v17.8b
-; CHECK-NEXT:    eor v4.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v5.8b, v5.8b, v19.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v20.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v21.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v22.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v6.8b
-; CHECK-NEXT:    eor v4.8b, v5.8b, v18.8b
-; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    eor v1.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v2.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v3.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: clmul_v2i32_neon:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEON-NEXT:    xtn v2.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v0.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v0.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v2.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v1.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-AES-LABEL: clmul_v2i32_neon:
+; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    ret
   %a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %a
 }
@@ -1730,45 +1556,15 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
 define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    mov v3.16b, v1.16b
-; CHECK-NEXT:    mov v4.16b, v1.16b
-; CHECK-NEXT:    mov v5.16b, v1.16b
-; CHECK-NEXT:    mov v6.16b, v1.16b
-; CHECK-NEXT:    mov v7.16b, v1.16b
-; CHECK-NEXT:    mov v16.16b, v1.16b
-; CHECK-NEXT:    bic v1.8h, #127
-; CHECK-NEXT:    bic v2.8h, #253
-; CHECK-NEXT:    bic v3.8h, #254
-; CHECK-NEXT:    bic v4.8h, #251
-; CHECK-NEXT:    bic v5.8h, #247
-; CHECK-NEXT:    bic v6.8h, #239
-; CHECK-NEXT:    bic v7.8h, #223
-; CHECK-NEXT:    bic v16.8h, #191
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    xtn v2.8b, v2.8h
-; CHECK-NEXT:    xtn v3.8b, v3.8h
-; CHECK-NEXT:    xtn v4.8b, v4.8h
-; CHECK-NEXT:    xtn v5.8b, v5.8h
-; CHECK-NEXT:    xtn v6.8b, v6.8h
-; CHECK-NEXT:    xtn v7.8b, v7.8h
-; CHECK-NEXT:    xtn v16.8b, v16.8h
-; CHECK-NEXT:    umull v2.8h, v0.8b, v2.8b
-; CHECK-NEXT:    umull v3.8h, v0.8b, v3.8b
-; CHECK-NEXT:    umull v4.8h, v0.8b, v4.8b
-; CHECK-NEXT:    umull v5.8h, v0.8b, v5.8b
-; CHECK-NEXT:    umull v6.8h, v0.8b, v6.8b
-; CHECK-NEXT:    umull v7.8h, v0.8b, v7.8b
-; CHECK-NEXT:    umull v16.8h, v0.8b, v16.8b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rbit v2.8b, v1.8b
+; CHECK-NEXT:    rbit v3.8b, v0.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    rbit v2.8b, v2.8b
+; CHECK-NEXT:    ushr v1.8b, v2.8b, #1
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <8 x i8> %x to <8 x i16>
   %zexty = zext <8 x i8> %y to <8 x i16>
@@ -1779,84 +1575,26 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: clmul_v16i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    rbit v4.8b, v1.8b
+; CHECK-NEXT:    rbit v5.8b, v0.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    rbit v6.8b, v3.8b
+; CHECK-NEXT:    rbit v7.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v3.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    pmul v5.8b, v7.8b, v6.8b
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v4.16b, v2.16b
-; CHECK-NEXT:    mov v5.16b, v2.16b
-; CHECK-NEXT:    mov v6.16b, v2.16b
-; CHECK-NEXT:    mov v7.16b, v2.16b
-; CHECK-NEXT:    mov v16.16b, v2.16b
-; CHECK-NEXT:    mov v17.16b, v2.16b
-; CHECK-NEXT:    mov v18.16b, v1.16b
-; CHECK-NEXT:    mov v19.16b, v1.16b
-; CHECK-NEXT:    mov v20.16b, v1.16b
-; CHECK-NEXT:    mov v21.16b, v1.16b
-; CHECK-NEXT:    mov v22.16b, v1.16b
-; CHECK-NEXT:    mov v23.16b, v1.16b
-; CHECK-NEXT:    bic v4.8h, #253
-; CHECK-NEXT:    bic v5.8h, #254
-; CHECK-NEXT:    bic v6.8h, #251
-; CHECK-NEXT:    bic v7.8h, #247
-; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    bic v16.8h, #239
-; CHECK-NEXT:    bic v17.8h, #223
-; CHECK-NEXT:    bic v18.8h, #253
-; CHECK-NEXT:    bic v19.8h, #254
-; CHECK-NEXT:    bic v20.8h, #251
-; CHECK-NEXT:    bic v21.8h, #247
-; CHECK-NEXT:    bic v22.8h, #239
-; CHECK-NEXT:    bic v23.8h, #223
-; CHECK-NEXT:    mov v24.16b, v1.16b
-; CHECK-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    bic v3.8h, #191
-; CHECK-NEXT:    uzp1 v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    uzp1 v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    xtn v18.8b, v18.8h
-; CHECK-NEXT:    xtn v19.8b, v19.8h
-; CHECK-NEXT:    xtn v20.8b, v20.8h
-; CHECK-NEXT:    xtn v21.8b, v21.8h
-; CHECK-NEXT:    xtn v22.8b, v22.8h
-; CHECK-NEXT:    xtn v23.8b, v23.8h
-; CHECK-NEXT:    bic v24.8h, #191
-; CHECK-NEXT:    umull2 v4.8h, v0.16b, v4.16b
-; CHECK-NEXT:    umull2 v5.8h, v0.16b, v5.16b
-; CHECK-NEXT:    umull2 v6.8h, v0.16b, v6.16b
-; CHECK-NEXT:    umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    umull2 v16.8h, v0.16b, v16.16b
-; CHECK-NEXT:    umull2 v17.8h, v0.16b, v17.16b
-; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT:    xtn v24.8b, v24.8h
-; CHECK-NEXT:    umull v19.8h, v0.8b, v19.8b
-; CHECK-NEXT:    umull v20.8h, v0.8b, v20.8b
-; CHECK-NEXT:    umull v21.8h, v0.8b, v21.8b
-; CHECK-NEXT:    umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT:    umull v23.8h, v0.8b, v23.8b
-; CHECK-NEXT:    bic v2.8h, #127
-; CHECK-NEXT:    bic v1.8h, #127
-; CHECK-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT:    umull v7.8h, v0.8b, v24.8b
-; CHECK-NEXT:    eor v16.16b, v19.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v20.16b, v21.16b
-; CHECK-NEXT:    eor v18.16b, v22.16b, v23.16b
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v6.16b, v18.16b, v7.16b
-; CHECK-NEXT:    umull2 v2.8h, v0.16b, v2.16b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    ushr v2.8b, v4.8b, #1
+; CHECK-NEXT:    ushr v3.8b, v5.8b, #1
+; CHECK-NEXT:    shll v2.8h, v2.8b, #8
+; CHECK-NEXT:    shll v3.8h, v3.8b, #8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <16 x i8> %x to <16 x i16>
   %zexty = zext <16 x i8> %y to <16 x i16>
@@ -1867,86 +1605,74 @@ define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: clmul_v4i32_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    rev16 v3.8b, v1.8b
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    xtn v17.8b, v1.8h
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v3.8b, v3.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    xtn v18.8b, v2.8h
+; CHECK-NEXT:    rbit v19.8b, v17.8b
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    xtn v5.8b, v3.8h
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    rbit v20.8b, v18.8b
+; CHECK-NEXT:    rbit v7.8b, v5.8b
+; CHECK-NEXT:    rbit v16.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    pmul v3.8b, v6.8b, v3.8b
+; CHECK-NEXT:    pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT:    pmul v6.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v7.8b, v16.8b, v7.8b
+; CHECK-NEXT:    xtn v16.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    eor v3.8b, v3.8b, v4.8b
+; CHECK-NEXT:    pmul v4.8b, v20.8b, v19.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    rbit v7.8b, v7.8b
+; CHECK-NEXT:    rbit v21.8b, v16.8b
+; CHECK-NEXT:    pmul v2.8b, v16.8b, v2.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    pmul v20.8b, v21.8b, v20.8b
+; CHECK-NEXT:    pmul v19.8b, v21.8b, v19.8b
+; CHECK-NEXT:    ushr v4.8b, v4.8b, #1
+; CHECK-NEXT:    eor v3.8b, v7.8b, v3.8b
+; CHECK-NEXT:    pmul v7.8b, v18.8b, v1.8b
+; CHECK-NEXT:    pmul v18.8b, v0.8b, v18.8b
+; CHECK-NEXT:    rbit v20.8b, v20.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v17.8b
+; CHECK-NEXT:    pmul v1.8b, v16.8b, v1.8b
+; CHECK-NEXT:    shll v3.8h, v3.8b, #8
+; CHECK-NEXT:    eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v18.8b
+; CHECK-NEXT:    ushr v7.8b, v20.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    rbit v5.8b, v19.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v6.8b
+; CHECK-NEXT:    eor v2.8b, v7.8b, v2.8b
+; CHECK-NEXT:    rev16 v3.8b, v3.8b
+; CHECK-NEXT:    ushr v1.8b, v5.8b, #1
+; CHECK-NEXT:    pmul v5.8b, v16.8b, v17.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    shll v2.8h, v2.8b, #8
+; CHECK-NEXT:    rbit v3.8b, v3.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v1.8b, v2.8b, v4.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v2.4h, v3.4h, #1
+; CHECK-NEXT:    ushll v3.8h, v5.8b, #0
+; CHECK-NEXT:    eor v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <4 x i16> %x to <4 x i32>
   %zexty = zext <4 x i16> %y to <4 x i32>
@@ -1966,152 +1692,138 @@ define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    .cfi_offset b10, -24
 ; CHECK-NEXT:    .cfi_offset b11, -32
 ; CHECK-NEXT:    .cfi_offset b12, -48
-; CHECK-NEXT:    movi v19.4s, #2
-; CHECK-NEXT:    movi v21.4s, #1
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    movi v17.4s, #4
-; CHECK-NEXT:    movi v20.4s, #8
-; CHECK-NEXT:    movi v5.4s, #16
-; CHECK-NEXT:    movi v4.4s, #32
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    and v3.16b, v2.16b, v19.16b
-; CHECK-NEXT:    and v6.16b, v2.16b, v21.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v17.16b
-; CHECK-NEXT:    and v16.16b, v2.16b, v20.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v22.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v3.8h
-; CHECK-NEXT:    movi v3.4s, #64
-; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v6.8h
-; CHECK-NEXT:    movi v6.4s, #128
-; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v7.8h
-; CHECK-NEXT:    movi v7.4s, #1, lsl #8
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v16.8h
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v18.8h
-; CHECK-NEXT:    uzp1 v28.8h, v0.8h, v22.8h
-; CHECK-NEXT:    movi v16.4s, #8, lsl #8
-; CHECK-NEXT:    movi v18.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #2, lsl #8
-; CHECK-NEXT:    umull2 v29.4s, v0.8h, v23.8h
-; CHECK-NEXT:    and v23.16b, v2.16b, v3.16b
-; CHECK-NEXT:    umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT:    and v30.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v31.16b, v2.16b, v7.16b
-; CHECK-NEXT:    umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT:    umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT:    umull2 v28.4s, v0.8h, v28.8h
-; CHECK-NEXT:    uzp1 v10.8h, v0.8h, v23.8h
-; CHECK-NEXT:    movi v23.4s, #32, lsl #8
-; CHECK-NEXT:    and v8.16b, v2.16b, v16.16b
-; CHECK-NEXT:    and v9.16b, v2.16b, v18.16b
-; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v31.8h, v0.8h, v31.8h
-; CHECK-NEXT:    and v11.16b, v2.16b, v22.16b
-; CHECK-NEXT:    eor v24.16b, v24.16b, v29.16b
-; CHECK-NEXT:    xtn v12.4h, v19.4s
-; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT:    eor v25.16b, v25.16b, v26.16b
-; CHECK-NEXT:    eor v26.16b, v27.16b, v28.16b
-; CHECK-NEXT:    uzp1 v9.8h, v0.8h, v9.8h
-; CHECK-NEXT:    and v29.16b, v2.16b, v23.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v10.8h
-; CHECK-NEXT:    umull2 v28.4s, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v11.8h
-; CHECK-NEXT:    umull2 v31.4s, v0.8h, v31.8h
-; CHECK-NEXT:    and v11.16b, v1.16b, v17.16b
-; CHECK-NEXT:    eor v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    and v10.16b, v1.16b, v21.16b
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v29.8h
-; CHECK-NEXT:    umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT:    movi v21.4s, #4, lsl #8
-; CHECK-NEXT:    umull2 v9.4s, v0.8h, v9.8h
-; CHECK-NEXT:    eor v19.16b, v26.16b, v27.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    umull2 v24.4s, v0.8h, v30.8h
-; CHECK-NEXT:    eor v25.16b, v28.16b, v31.16b
-; CHECK-NEXT:    xtn v28.4h, v11.4s
-; CHECK-NEXT:    xtn v30.4h, v20.4s
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT:    xtn v10.4h, v10.4s
-; CHECK-NEXT:    and v29.16b, v2.16b, v21.16b
-; CHECK-NEXT:    eor v26.16b, v8.16b, v9.16b
-; CHECK-NEXT:    and v9.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v20.16b, v25.16b, v24.16b
-; CHECK-NEXT:    and v25.16b, v1.16b, v5.16b
-; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT:    umull v30.4s, v0.4h, v30.4h
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    eor v4.16b, v26.16b, v27.16b
-; CHECK-NEXT:    and v26.16b, v1.16b, v6.16b
-; CHECK-NEXT:    xtn v27.4h, v9.4s
-; CHECK-NEXT:    xtn v25.4h, v25.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v29.8h
-; CHECK-NEXT:    xtn v26.4h, v26.4s
-; CHECK-NEXT:    eor v28.16b, v28.16b, v30.16b
-; CHECK-NEXT:    movi v30.4s, #128, lsl #8
-; CHECK-NEXT:    umull v27.4s, v0.4h, v27.4h
-; CHECK-NEXT:    and v29.16b, v2.16b, v24.16b
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    umull v25.4s, v0.4h, v25.4h
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v23.4h, v23.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v31.4s, v0.4h, v12.4h
-; CHECK-NEXT:    umull v8.4s, v0.4h, v10.4h
-; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    umull v26.4s, v0.4h, v26.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    eor v25.16b, v25.16b, v27.16b
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT:    and v2.16b, v2.16b, v30.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v30.16b
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    xtn v24.4h, v24.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    umull v23.4s, v0.4h, v23.4h
-; CHECK-NEXT:    eor v6.16b, v8.16b, v31.16b
-; CHECK-NEXT:    eor v7.16b, v26.16b, v7.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    rev16 v5.8b, v1.8b
+; CHECK-NEXT:    rev16 v6.8b, v0.8b
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    rev16 v7.8b, v3.8b
+; CHECK-NEXT:    rev16 v17.8b, v2.8b
+; CHECK-NEXT:    rbit v18.8b, v5.8b
+; CHECK-NEXT:    rbit v19.8b, v6.8b
+; CHECK-NEXT:    xtn v5.8b, v1.8h
+; CHECK-NEXT:    xtn v16.8b, v4.8h
+; CHECK-NEXT:    shrn v29.8b, v4.8h, #8
+; CHECK-NEXT:    xtn v6.8b, v0.8h
+; CHECK-NEXT:    shrn v4.8b, v0.8h, #8
+; CHECK-NEXT:    xtn v0.8b, v3.8h
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    rbit v20.8b, v7.8b
+; CHECK-NEXT:    rbit v17.8b, v17.8b
+; CHECK-NEXT:    xtn v21.8b, v18.8h
+; CHECK-NEXT:    xtn v22.8b, v19.8h
+; CHECK-NEXT:    shrn v7.8b, v1.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v19.8h, #8
+; CHECK-NEXT:    shrn v18.8b, v18.8h, #8
+; CHECK-NEXT:    pmul v8.8b, v29.8b, v5.8b
+; CHECK-NEXT:    rbit v23.8b, v5.8b
+; CHECK-NEXT:    rbit v24.8b, v16.8b
+; CHECK-NEXT:    pmul v12.8b, v4.8b, v16.8b
+; CHECK-NEXT:    pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    xtn v25.8b, v20.8h
+; CHECK-NEXT:    xtn v26.8b, v17.8h
+; CHECK-NEXT:    rbit v27.8b, v21.8b
+; CHECK-NEXT:    rbit v28.8b, v22.8b
+; CHECK-NEXT:    pmul v10.8b, v1.8b, v21.8b
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    pmul v18.8b, v22.8b, v18.8b
+; CHECK-NEXT:    shrn v20.8b, v20.8h, #8
+; CHECK-NEXT:    pmul v9.8b, v16.8b, v7.8b
+; CHECK-NEXT:    xtn v1.8b, v2.8h
+; CHECK-NEXT:    pmul v21.8b, v22.8b, v21.8b
+; CHECK-NEXT:    pmul v19.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v30.8b, v25.8b
+; CHECK-NEXT:    rbit v31.8b, v26.8b
+; CHECK-NEXT:    pmul v17.8b, v17.8b, v25.8b
+; CHECK-NEXT:    pmul v27.8b, v28.8b, v27.8b
+; CHECK-NEXT:    pmul v20.8b, v26.8b, v20.8b
+; CHECK-NEXT:    rbit v28.8b, v6.8b
+; CHECK-NEXT:    eor v18.8b, v18.8b, v10.8b
+; CHECK-NEXT:    eor v8.8b, v9.8b, v8.8b
+; CHECK-NEXT:    rbit v9.8b, v0.8b
+; CHECK-NEXT:    rbit v10.8b, v1.8b
+; CHECK-NEXT:    pmul v22.8b, v26.8b, v25.8b
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    pmul v30.8b, v31.8b, v30.8b
+; CHECK-NEXT:    ushll v21.8h, v21.8b, #0
+; CHECK-NEXT:    rbit v19.8b, v19.8b
+; CHECK-NEXT:    rbit v27.8b, v27.8b
+; CHECK-NEXT:    eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT:    pmul v11.8b, v28.8b, v24.8b
+; CHECK-NEXT:    pmul v25.8b, v24.8b, v9.8b
+; CHECK-NEXT:    pmul v31.8b, v6.8b, v29.8b
+; CHECK-NEXT:    pmul v7.8b, v6.8b, v7.8b
+; CHECK-NEXT:    pmul v24.8b, v10.8b, v24.8b
+; CHECK-NEXT:    ushll v22.8h, v22.8b, #0
+; CHECK-NEXT:    pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT:    rbit v30.8b, v30.8b
+; CHECK-NEXT:    ushr v19.8b, v19.8b, #1
+; CHECK-NEXT:    ushr v27.8b, v27.8b, #1
+; CHECK-NEXT:    rbit v11.8b, v11.8b
+; CHECK-NEXT:    rbit v25.8b, v25.8b
+; CHECK-NEXT:    eor v31.8b, v31.8b, v12.8b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    rbit v24.8b, v24.8b
+; CHECK-NEXT:    eor v19.8b, v19.8b, v8.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    eor v18.8b, v27.8b, v18.8b
+; CHECK-NEXT:    ushr v20.8b, v30.8b, #1
+; CHECK-NEXT:    pmul v27.8b, v16.8b, v3.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v16.8b
+; CHECK-NEXT:    pmul v2.8b, v2.8b, v0.8b
+; CHECK-NEXT:    pmul v3.8b, v1.8b, v3.8b
+; CHECK-NEXT:    ushr v26.8b, v11.8b, #1
+; CHECK-NEXT:    shll v19.8h, v19.8b, #8
+; CHECK-NEXT:    shll v18.8h, v18.8b, #8
+; CHECK-NEXT:    eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT:    pmul v20.8b, v28.8b, v23.8b
+; CHECK-NEXT:    pmul v28.8b, v1.8b, v29.8b
+; CHECK-NEXT:    pmul v23.8b, v29.8b, v0.8b
+; CHECK-NEXT:    ushr v24.8b, v24.8b, #1
+; CHECK-NEXT:    eor v26.8b, v26.8b, v31.8b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    shll v17.8h, v17.8b, #8
+; CHECK-NEXT:    orr v18.16b, v21.16b, v18.16b
+; CHECK-NEXT:    pmul v21.8b, v10.8b, v9.8b
 ; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    eor v16.16b, v16.16b, v18.16b
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull2 v5.4s, v0.8h, v5.8h
-; CHECK-NEXT:    umull2 v18.4s, v0.8h, v27.8h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v24.4s, v0.4h, v24.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v28.16b
-; CHECK-NEXT:    eor v3.16b, v25.16b, v3.16b
-; CHECK-NEXT:    eor v7.16b, v7.16b, v22.16b
-; CHECK-NEXT:    eor v16.16b, v16.16b, v23.16b
-; CHECK-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT:    umull2 v2.4s, v0.8h, v2.8h
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v18.16b
-; CHECK-NEXT:    eor v1.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v24.16b
-; CHECK-NEXT:    eor v5.16b, v17.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v6.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v5.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    rbit v20.8b, v20.8b
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    eor v16.8b, v28.8b, v16.8b
+; CHECK-NEXT:    orr v17.16b, v22.16b, v17.16b
+; CHECK-NEXT:    eor v23.8b, v27.8b, v23.8b
+; CHECK-NEXT:    ushr v22.8b, v25.8b, #1
+; CHECK-NEXT:    rbit v21.8b, v21.8b
+; CHECK-NEXT:    rev16 v18.8b, v18.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v20.8b, #1
+; CHECK-NEXT:    eor v16.8b, v24.8b, v16.8b
+; CHECK-NEXT:    rev16 v17.8b, v17.8b
+; CHECK-NEXT:    eor v20.8b, v22.8b, v23.8b
+; CHECK-NEXT:    shll v22.8h, v26.8b, #8
+; CHECK-NEXT:    ushr v3.8b, v21.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    shll v7.8h, v16.8b, #8
+; CHECK-NEXT:    shll v6.8h, v20.8b, #8
+; CHECK-NEXT:    rbit v16.8b, v17.8b
+; CHECK-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v22.8b, v19.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v18.4h, #1
+; CHECK-NEXT:    eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    ushr v7.4h, v16.4h, #1
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    orr v3.16b, v5.16b, v4.16b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v6.8b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-NEXT:    orr v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    orr v1.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %zextx = zext <8 x i16> %x to <8 x i32>
@@ -4525,72 +4237,26 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulr_v8i16_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
-; CHECK-NEXT:    rev16 v3.16b, v0.16b
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v5.8h, #4
-; CHECK-NEXT:    movi v6.8h, #8
-; CHECK-NEXT:    movi v7.8h, #16
-; CHECK-NEXT:    movi v16.8h, #32
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    movi v18.8h, #1, lsl #8
-; CHECK-NEXT:    movi v19.8h, #8, lsl #8
-; CHECK-NEXT:    movi v20.8h, #16, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v3.16b
-; CHECK-NEXT:    movi v3.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #128, lsl #8
-; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -4605,87 +4271,29 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i16> @clmulr_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulr_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    shrn v0.4h, v0.4s, #15
+; CHECK-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %a.ext = zext <4 x i16> %a to <4 x i32>
   %b.ext = zext <4 x i16> %b to <4 x i32>
@@ -4699,136 +4307,87 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; CHECK-LABEL: clmulr_v4i32_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev32 v1.16b, v1.16b
-; CHECK-NEXT:    rev32 v2.16b, v0.16b
-; CHECK-NEXT:    movi v3.4s, #2
-; CHECK-NEXT:    movi v4.4s, #1
-; CHECK-NEXT:    movi v5.4s, #4
-; CHECK-NEXT:    movi v6.4s, #8
-; CHECK-NEXT:    movi v7.4s, #16
-; CHECK-NEXT:    movi v16.4s, #32
-; CHECK-NEXT:    movi v17.4s, #64
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #128
-; CHECK-NEXT:    movi v21.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v18.4s, #32, lsl #8
-; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #1, lsl #16
-; CHECK-NEXT:    movi v20.4s, #4, lsl #8
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    movi v21.4s, #64, lsl #8
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #16
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #64, lsl #16
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT:    movi v18.4s, #4, lsl #16
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v21.4s, #1, lsl #24
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v18.4s, #128, lsl #8
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v2.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    rev32 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -4843,209 +4402,106 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i32> @clmulr_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-LABEL: clmulr_v2i32_neon:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    dup v3.2d, x9
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
-; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v5.2d, x9
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x9
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v23.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v24.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v26.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #31
+; CHECK-NEON-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-AES-LABEL: clmulr_v2i32_neon:
 ; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v1.8b, v1.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
 ; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT:    shrn v0.2s, v0.2d, #31
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ret
   %a.ext = zext <2 x i32> %a to <2 x i64>
   %b.ext = zext <2 x i32> %b to <2 x i64>
@@ -5113,72 +4569,26 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulh_v8i16_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
-; CHECK-NEXT:    rev16 v3.16b, v0.16b
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v5.8h, #4
-; CHECK-NEXT:    movi v6.8h, #8
-; CHECK-NEXT:    movi v7.8h, #16
-; CHECK-NEXT:    movi v16.8h, #32
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    movi v18.8h, #1, lsl #8
-; CHECK-NEXT:    movi v19.8h, #8, lsl #8
-; CHECK-NEXT:    movi v20.8h, #16, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v3.16b
-; CHECK-NEXT:    movi v3.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #128, lsl #8
-; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
@@ -5194,87 +4604,30 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i16> @clmulh_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulh_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    ushr v0.4h, v0.4h, #1
 ; CHECK-NEXT:    ret
   %a.ext = zext <4 x i16> %a to <4 x i32>
   %b.ext = zext <4 x i16> %b to <4 x i32>
@@ -5288,136 +4641,87 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; CHECK-LABEL: clmulh_v4i32_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev32 v1.16b, v1.16b
-; CHECK-NEXT:    rev32 v2.16b, v0.16b
-; CHECK-NEXT:    movi v3.4s, #2
-; CHECK-NEXT:    movi v4.4s, #1
-; CHECK-NEXT:    movi v5.4s, #4
-; CHECK-NEXT:    movi v6.4s, #8
-; CHECK-NEXT:    movi v7.4s, #16
-; CHECK-NEXT:    movi v16.4s, #32
-; CHECK-NEXT:    movi v17.4s, #64
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #128
-; CHECK-NEXT:    movi v21.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v18.4s, #32, lsl #8
-; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #1, lsl #16
-; CHECK-NEXT:    movi v20.4s, #4, lsl #8
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    movi v21.4s, #64, lsl #8
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #16
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #64, lsl #16
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT:    movi v18.4s, #4, lsl #16
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v21.4s, #1, lsl #24
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v18.4s, #128, lsl #8
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v2.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    rev32 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
@@ -5433,209 +4737,108 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i32> @clmulh_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-LABEL: clmulh_v2i32_neon:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    dup v3.2d, x9
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
-; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v5.2d, x9
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x9
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v23.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v24.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v26.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEON-NEXT:    ushr v0.2s, v0.2s, #1
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-AES-LABEL: clmulh_v2i32_neon:
 ; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v1.8b, v1.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
 ; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
+; CHECK-AES-NEXT:    ushr v0.2s, v0.2s, #1
 ; CHECK-AES-NEXT:    ret
   %a.ext = zext <2 x i32> %a to <2 x i64>
   %b.ext = zext <2 x i32> %b to <2 x i64>
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
index 5ab7e5ebbb4e5..1bebca731bc2d 100644
--- a/llvm/test/CodeGen/PowerPC/clmul-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -1765,671 +1765,571 @@ define <2 x i64> @clmul_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; LE-LABEL: clmul_v2i64:
 ; LE:       # %bb.0:
-; LE-NEXT:    vspltisw 4, 2
-; LE-NEXT:    vspltisw 0, 1
-; LE-NEXT:    vaddudm 8, 2, 2
-; LE-NEXT:    addis 3, 2, .LCPI3_0 at toc@ha
-; LE-NEXT:    vspltisw 6, 8
-; LE-NEXT:    addi 3, 3, .LCPI3_0 at toc@l
-; LE-NEXT:    vspltisw 5, 4
-; LE-NEXT:    vupklsw 7, 4
-; LE-NEXT:    vspltisw 4, 3
-; LE-NEXT:    vupklsw 1, 0
-; LE-NEXT:    vsld 9, 2, 7
-; LE-NEXT:    xxland 39, 35, 39
-; LE-NEXT:    vupklsw 6, 6
-; LE-NEXT:    vupklsw 5, 5
-; LE-NEXT:    xxland 42, 35, 37
-; LE-NEXT:    vupklsw 0, 4
-; LE-NEXT:    xxlxor 36, 36, 36
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    xxland 33, 35, 33
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    vcmpgtud 10, 10, 4
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 2, 39, 40
-; LE-NEXT:    xxland 39, 35, 38
-; LE-NEXT:    xxland 0, 42, 41
-; LE-NEXT:    xxland 10, 33, 34
-; LE-NEXT:    vsld 6, 2, 6
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxlxor 2, 10, 2
-; LE-NEXT:    xxland 1, 39, 32
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_1 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_1 at toc@l
-; LE-NEXT:    vspltisw 8, 5
-; LE-NEXT:    vspltisw 10, 7
-; LE-NEXT:    xxlxor 0, 2, 0
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    xxland 41, 35, 32
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 8, 2, 8
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    vupklsw 10, 10
-; LE-NEXT:    xxland 3, 41, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_2 at toc@ha
-; LE-NEXT:    vspltisw 7, 6
-; LE-NEXT:    addi 3, 3, .LCPI3_2 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    lxvd2x 5, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_3 at toc@ha
-; LE-NEXT:    xxland 41, 35, 37
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_3 at toc@l
-; LE-NEXT:    vupklsw 7, 7
-; LE-NEXT:    vsld 7, 2, 7
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    lxvd2x 6, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_4 at toc@ha
-; LE-NEXT:    xxland 4, 41, 40
-; LE-NEXT:    xxland 40, 35, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_4 at toc@l
-; LE-NEXT:    vcmpgtud 8, 8, 4
-; LE-NEXT:    xxlxor 0, 0, 4
-; LE-NEXT:    lxvd2x 7, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_5 at toc@ha
-; LE-NEXT:    xxland 41, 35, 6
-; LE-NEXT:    xxland 5, 40, 39
-; LE-NEXT:    vsld 7, 2, 10
-; LE-NEXT:    addi 3, 3, .LCPI3_5 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 5
-; LE-NEXT:    lxvd2x 8, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_6 at toc@ha
-; LE-NEXT:    xxland 6, 41, 39
-; LE-NEXT:    xxland 41, 35, 7
-; LE-NEXT:    addi 3, 3, .LCPI3_6 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 6
-; LE-NEXT:    lxvd2x 9, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_7 at toc@ha
-; LE-NEXT:    xxland 7, 41, 38
-; LE-NEXT:    addi 3, 3, .LCPI3_7 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 7
-; LE-NEXT:    lxvd2x 10, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_8 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_8 at toc@l
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_9 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_9 at toc@l
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_10 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_10 at toc@l
-; LE-NEXT:    vspltisw 8, 9
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vsld 6, 2, 7
-; LE-NEXT:    xxland 39, 35, 8
-; LE-NEXT:    vspltisw 8, 10
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxland 8, 39, 38
-; LE-NEXT:    xxland 39, 35, 9
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 6, 2, 8
-; LE-NEXT:    xxland 9, 39, 38
-; LE-NEXT:    xxland 39, 35, 10
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxlxor 0, 0, 8
-; LE-NEXT:    xxlxor 0, 0, 9
-; LE-NEXT:    vspltisw 9, 11
-; LE-NEXT:    vupklsw 8, 9
-; LE-NEXT:    vsld 1, 2, 8
-; LE-NEXT:    xxland 10, 39, 33
-; LE-NEXT:    xxland 39, 35, 1
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxlxor 0, 0, 10
-; LE-NEXT:    vspltisw 6, 12
-; LE-NEXT:    vupklsw 6, 6
-; LE-NEXT:    vsld 6, 2, 6
-; LE-NEXT:    xxland 1, 39, 38
-; LE-NEXT:    xxland 39, 35, 2
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    addis 3, 2, .LCPI3_11 at toc@ha
-; LE-NEXT:    vspltisw 1, 13
-; LE-NEXT:    addi 3, 3, .LCPI3_11 at toc@l
-; LE-NEXT:    vupklsw 1, 1
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    xxland 2, 39, 33
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_12 at toc@ha
-; LE-NEXT:    xxland 39, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_12 at toc@l
-; LE-NEXT:    vspltisw 6, 14
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    vupklsw 6, 6
-; LE-NEXT:    vsld 6, 2, 6
-; LE-NEXT:    xxland 1, 39, 38
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_13 at toc@ha
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    vspltisw 1, 15
-; LE-NEXT:    addi 3, 3, .LCPI3_13 at toc@l
-; LE-NEXT:    vupklsw 1, 1
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_14 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_14 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_15 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_15 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_16 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_16 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_17 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_17 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_18 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_18 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_19 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_19 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_20 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_20 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_21 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_21 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_22 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_22 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_23 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_23 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_24 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_24 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_25 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_25 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_26 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_26 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_27 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_27 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_28 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_28 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_29 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_29 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_30 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_30 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_31 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_31 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_32 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_32 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_33 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_33 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_34 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_34 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_35 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_35 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_36 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_36 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_37 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_37 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_38 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_38 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_39 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_39 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_40 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_40 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_41 at toc@ha
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_41 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_42 at toc@ha
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_42 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_43 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_43 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_44 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_44 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_45 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_45 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_46 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_46 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_47 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_47 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_48 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_48 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_49 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_49 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_50 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_50 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_51 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_51 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_52 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_52 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_53 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_53 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_54 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_54 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_55 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_55 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_56 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_56 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_57 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_57 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_58 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_58 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_59 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_59 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_60 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_60 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_61 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_61 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_62 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_62 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_63 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_63 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_64 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_64 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_65 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_65 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_66 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_66 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_67 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_67 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_68 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_68 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_69 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_69 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_70 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_70 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_71 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_71 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_72 at toc@ha
-; LE-NEXT:    xxland 1, 38, 32
-; LE-NEXT:    vsld 1, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_72 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_73 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_73 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_74 at toc@ha
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_74 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_75 at toc@ha
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_75 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 3, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_76 at toc@ha
-; LE-NEXT:    xxland 1, 38, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_76 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 33, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_77 at toc@ha
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addi 3, 3, .LCPI3_77 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_78 at toc@ha
-; LE-NEXT:    xxland 2, 38, 32
-; LE-NEXT:    vsld 0, 2, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_78 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_79 at toc@ha
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_79 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_80 at toc@ha
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_80 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_81 at toc@ha
-; LE-NEXT:    xxland 33, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_81 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_82 at toc@ha
-; LE-NEXT:    xxland 2, 33, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    addi 3, 3, .LCPI3_82 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_83 at toc@ha
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_83 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    lxvd2x 2, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_84 at toc@ha
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_84 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_85 at toc@ha
-; LE-NEXT:    xxland 33, 35, 2
-; LE-NEXT:    addi 3, 3, .LCPI3_85 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 37
-; LE-NEXT:    vsld 0, 2, 0
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_86 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_86 at toc@l
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_87 at toc@ha
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_87 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_88 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_88 at toc@l
-; LE-NEXT:    lxvd2x 32, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_89 at toc@ha
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    addi 3, 3, .LCPI3_89 at toc@l
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 37
-; LE-NEXT:    vsld 5, 2, 0
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_90 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_90 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_91 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_91 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_92 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_92 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_93 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_93 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_94 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_94 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_95 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_95 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_96 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_96 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_97 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_97 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_98 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_98 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_99 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_99 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_100 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_100 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_101 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_101 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_102 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_102 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_103 at toc@ha
-; LE-NEXT:    addi 3, 3, .LCPI3_103 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    addis 3, 2, .LCPI3_104 at toc@ha
-; LE-NEXT:    vsld 5, 2, 5
-; LE-NEXT:    addi 3, 3, .LCPI3_104 at toc@l
-; LE-NEXT:    xxland 32, 35, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    xxleqv 37, 37, 37
-; LE-NEXT:    vsld 2, 2, 5
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 3
-; LE-NEXT:    xxland 35, 35, 1
-; LE-NEXT:    vcmpgtud 3, 3, 4
-; LE-NEXT:    xxland 1, 35, 34
-; LE-NEXT:    xxlxor 34, 0, 1
+; LE-NEXT:    stdu 1, -480(1)
+; LE-NEXT:    mfvsrd 4, 35
+; LE-NEXT:    mfvsrd 3, 34
+; LE-NEXT:    std 16, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    std 14, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    xxswapd 0, 35
+; LE-NEXT:    xxswapd 1, 34
+; LE-NEXT:    std 18, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    std 30, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 30, 30
+; LE-NEXT:    clrldi 6, 4, 63
+; LE-NEXT:    rlwinm 7, 4, 0, 29, 29
+; LE-NEXT:    rlwinm 8, 4, 0, 28, 28
+; LE-NEXT:    rlwinm 9, 4, 0, 27, 27
+; LE-NEXT:    rlwinm 10, 4, 0, 26, 26
+; LE-NEXT:    rlwinm 11, 4, 0, 25, 25
+; LE-NEXT:    rlwinm 12, 4, 0, 24, 24
+; LE-NEXT:    rlwinm 0, 4, 0, 23, 23
+; LE-NEXT:    rlwinm 30, 4, 0, 22, 22
+; LE-NEXT:    std 19, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 29, 4, 0, 21, 21
+; LE-NEXT:    std 20, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    std 28, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 28, 4, 0, 20, 20
+; LE-NEXT:    std 21, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    std 22, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 7, 3, 7
+; LE-NEXT:    mulld 8, 3, 8
+; LE-NEXT:    std 27, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 27, 4, 0, 19, 19
+; LE-NEXT:    std 23, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    std 26, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 26, 4, 0, 18, 18
+; LE-NEXT:    std 24, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 25, 4, 0, 17, 17
+; LE-NEXT:    std 31, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    std 2, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    xor 5, 6, 5
+; LE-NEXT:    std 8, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 9
+; LE-NEXT:    xor 16, 5, 7
+; LE-NEXT:    rlwinm 5, 4, 0, 16, 16
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 10
+; LE-NEXT:    std 5, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 15, 15
+; LE-NEXT:    std 8, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 11
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 12
+; LE-NEXT:    std 5, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 14, 14
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 0
+; LE-NEXT:    std 5, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 13, 13
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 30
+; LE-NEXT:    std 5, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 12, 12
+; LE-NEXT:    std 8, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 29
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 28
+; LE-NEXT:    std 5, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 11, 11
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 27
+; LE-NEXT:    std 5, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 10, 10
+; LE-NEXT:    std 8, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 26
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 8, 3, 25
+; LE-NEXT:    std 5, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 9, 9
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 8, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    std 5, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 8, 8
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 7, 7
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 6, 6
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 5, 5
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 4, 4
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 3, 3
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 2, 2
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 1, 1
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 5, 4, 0, 0, 0
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 32, 32
+; LE-NEXT:    rldicl 5, 5, 32, 31
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 31, 33
+; LE-NEXT:    rldicl 5, 5, 33, 30
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 30, 34
+; LE-NEXT:    rldicl 5, 5, 34, 29
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 29, 35
+; LE-NEXT:    rldicl 5, 5, 35, 28
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 28, 36
+; LE-NEXT:    rldicl 5, 5, 36, 27
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 27, 37
+; LE-NEXT:    rldicl 5, 5, 37, 26
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 26, 38
+; LE-NEXT:    rldicl 5, 5, 38, 25
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    std 5, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 5, 4, 25, 39
+; LE-NEXT:    rldicl 5, 5, 39, 24
+; LE-NEXT:    mulld 14, 3, 5
+; LE-NEXT:    rldicl 5, 4, 24, 40
+; LE-NEXT:    rldicl 5, 5, 40, 23
+; LE-NEXT:    mulld 15, 3, 5
+; LE-NEXT:    rldicl 5, 4, 23, 41
+; LE-NEXT:    rldicl 5, 5, 41, 22
+; LE-NEXT:    mulld 17, 3, 5
+; LE-NEXT:    rldicl 5, 4, 22, 42
+; LE-NEXT:    rldicl 5, 5, 42, 21
+; LE-NEXT:    mulld 18, 3, 5
+; LE-NEXT:    rldicl 5, 4, 21, 43
+; LE-NEXT:    rldicl 5, 5, 43, 20
+; LE-NEXT:    mulld 19, 3, 5
+; LE-NEXT:    rldicl 5, 4, 20, 44
+; LE-NEXT:    rldicl 5, 5, 44, 19
+; LE-NEXT:    mulld 20, 3, 5
+; LE-NEXT:    rldicl 5, 4, 19, 45
+; LE-NEXT:    rldicl 5, 5, 45, 18
+; LE-NEXT:    mulld 21, 3, 5
+; LE-NEXT:    rldicl 5, 4, 18, 46
+; LE-NEXT:    rldicl 5, 5, 46, 17
+; LE-NEXT:    mulld 22, 3, 5
+; LE-NEXT:    rldicl 5, 4, 17, 47
+; LE-NEXT:    rldicl 5, 5, 47, 16
+; LE-NEXT:    mulld 23, 3, 5
+; LE-NEXT:    rldicl 5, 4, 16, 48
+; LE-NEXT:    rldicl 5, 5, 48, 15
+; LE-NEXT:    mulld 24, 3, 5
+; LE-NEXT:    rldicl 5, 4, 15, 49
+; LE-NEXT:    rldicl 5, 5, 49, 14
+; LE-NEXT:    mulld 25, 3, 5
+; LE-NEXT:    rldicl 5, 4, 14, 50
+; LE-NEXT:    rldicl 5, 5, 50, 13
+; LE-NEXT:    mulld 26, 3, 5
+; LE-NEXT:    rldicl 5, 4, 13, 51
+; LE-NEXT:    rldicl 5, 5, 51, 12
+; LE-NEXT:    mulld 27, 3, 5
+; LE-NEXT:    rldicl 5, 4, 12, 52
+; LE-NEXT:    rldicl 5, 5, 52, 11
+; LE-NEXT:    mulld 28, 3, 5
+; LE-NEXT:    rldicl 5, 4, 11, 53
+; LE-NEXT:    rldicl 5, 5, 53, 10
+; LE-NEXT:    mulld 29, 3, 5
+; LE-NEXT:    rldicl 5, 4, 10, 54
+; LE-NEXT:    rldicl 5, 5, 54, 9
+; LE-NEXT:    mulld 30, 3, 5
+; LE-NEXT:    rldicl 5, 4, 9, 55
+; LE-NEXT:    rldicl 5, 5, 55, 8
+; LE-NEXT:    mulld 0, 3, 5
+; LE-NEXT:    rldicl 5, 4, 8, 56
+; LE-NEXT:    rldicl 5, 5, 56, 7
+; LE-NEXT:    mulld 12, 3, 5
+; LE-NEXT:    rldicl 5, 4, 7, 57
+; LE-NEXT:    rldicl 5, 5, 57, 6
+; LE-NEXT:    mulld 11, 3, 5
+; LE-NEXT:    rldicl 5, 4, 6, 58
+; LE-NEXT:    rldicl 5, 5, 58, 5
+; LE-NEXT:    mulld 10, 3, 5
+; LE-NEXT:    rldicl 5, 4, 5, 59
+; LE-NEXT:    rldicl 5, 5, 59, 4
+; LE-NEXT:    mulld 9, 3, 5
+; LE-NEXT:    rldicl 5, 4, 4, 60
+; LE-NEXT:    rldicl 5, 5, 60, 3
+; LE-NEXT:    mulld 8, 3, 5
+; LE-NEXT:    rldicl 5, 4, 3, 61
+; LE-NEXT:    rldicl 5, 5, 61, 2
+; LE-NEXT:    mulld 7, 3, 5
+; LE-NEXT:    rldicl 5, 4, 2, 62
+; LE-NEXT:    rldicr 4, 4, 0, 0
+; LE-NEXT:    rldicl 5, 5, 62, 1
+; LE-NEXT:    mulld 6, 3, 5
+; LE-NEXT:    mulld 5, 3, 4
+; LE-NEXT:    mffprd 4, 0
+; LE-NEXT:    mffprd 3, 1
+; LE-NEXT:    rlwinm 2, 4, 0, 30, 30
+; LE-NEXT:    clrldi 31, 4, 63
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    mulld 31, 3, 31
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 29, 29
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 28, 28
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 27, 27
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 26, 26
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 25, 25
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 24, 24
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 23, 23
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 22, 22
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 21, 21
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 20, 20
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 19, 19
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 18, 18
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 17, 17
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 16, 16
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 15, 15
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 14, 14
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    rlwinm 2, 4, 0, 13, 13
+; LE-NEXT:    mulld 2, 3, 2
+; LE-NEXT:    xor 31, 31, 2
+; LE-NEXT:    ld 2, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 2
+; LE-NEXT:    ld 2, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 14
+; LE-NEXT:    ld 14, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 16, 16, 15
+; LE-NEXT:    ld 15, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 17, 16, 17
+; LE-NEXT:    ld 16, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 18, 17, 18
+; LE-NEXT:    ld 17, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 19, 18, 19
+; LE-NEXT:    ld 18, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 20, 19, 20
+; LE-NEXT:    ld 19, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 21, 20, 21
+; LE-NEXT:    ld 20, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 22, 21, 22
+; LE-NEXT:    ld 21, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 23, 22, 23
+; LE-NEXT:    ld 22, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 24, 23, 24
+; LE-NEXT:    ld 23, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 25, 24, 25
+; LE-NEXT:    ld 24, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 26, 25, 26
+; LE-NEXT:    ld 25, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 27, 26, 27
+; LE-NEXT:    ld 26, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 28, 27, 28
+; LE-NEXT:    ld 27, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 29, 28, 29
+; LE-NEXT:    ld 28, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 30, 29, 30
+; LE-NEXT:    ld 29, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 0, 30, 0
+; LE-NEXT:    ld 30, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 12, 0, 12
+; LE-NEXT:    xor 11, 12, 11
+; LE-NEXT:    xor 10, 11, 10
+; LE-NEXT:    xor 9, 10, 9
+; LE-NEXT:    xor 8, 9, 8
+; LE-NEXT:    xor 7, 8, 7
+; LE-NEXT:    xor 6, 7, 6
+; LE-NEXT:    xor 5, 6, 5
+; LE-NEXT:    rlwinm 6, 4, 0, 11, 11
+; LE-NEXT:    mtfprd 0, 5
+; LE-NEXT:    rlwinm 5, 4, 0, 12, 12
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 5, 3, 5
+; LE-NEXT:    xor 5, 31, 5
+; LE-NEXT:    ld 31, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 10, 10
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 9, 9
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 8, 8
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 7, 7
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 6, 6
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 5, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 4, 4
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 3, 3
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 2, 2
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 1, 1
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rlwinm 6, 4, 0, 0, 0
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 32, 32
+; LE-NEXT:    rldicl 6, 6, 32, 31
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 31, 33
+; LE-NEXT:    rldicl 6, 6, 33, 30
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 30, 34
+; LE-NEXT:    rldicl 6, 6, 34, 29
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 29, 35
+; LE-NEXT:    rldicl 6, 6, 35, 28
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 28, 36
+; LE-NEXT:    rldicl 6, 6, 36, 27
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 27, 37
+; LE-NEXT:    rldicl 6, 6, 37, 26
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 26, 38
+; LE-NEXT:    rldicl 6, 6, 38, 25
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 25, 39
+; LE-NEXT:    rldicl 6, 6, 39, 24
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 24, 40
+; LE-NEXT:    rldicl 6, 6, 40, 23
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 23, 41
+; LE-NEXT:    rldicl 6, 6, 41, 22
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 22, 42
+; LE-NEXT:    rldicl 6, 6, 42, 21
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 21, 43
+; LE-NEXT:    rldicl 6, 6, 43, 20
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 20, 44
+; LE-NEXT:    rldicl 6, 6, 44, 19
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 19, 45
+; LE-NEXT:    rldicl 6, 6, 45, 18
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 18, 46
+; LE-NEXT:    rldicl 6, 6, 46, 17
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 17, 47
+; LE-NEXT:    rldicl 6, 6, 47, 16
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 16, 48
+; LE-NEXT:    rldicl 6, 6, 48, 15
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 15, 49
+; LE-NEXT:    rldicl 6, 6, 49, 14
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 14, 50
+; LE-NEXT:    rldicl 6, 6, 50, 13
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 13, 51
+; LE-NEXT:    rldicl 6, 6, 51, 12
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 12, 52
+; LE-NEXT:    rldicl 6, 6, 52, 11
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 11, 53
+; LE-NEXT:    rldicl 6, 6, 53, 10
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 10, 54
+; LE-NEXT:    rldicl 6, 6, 54, 9
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 9, 55
+; LE-NEXT:    rldicl 6, 6, 55, 8
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 8, 56
+; LE-NEXT:    rldicl 6, 6, 56, 7
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 7, 57
+; LE-NEXT:    rldicl 6, 6, 57, 6
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 6, 58
+; LE-NEXT:    rldicl 6, 6, 58, 5
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 5, 59
+; LE-NEXT:    rldicl 6, 6, 59, 4
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 4, 60
+; LE-NEXT:    rldicl 6, 6, 60, 3
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 3, 61
+; LE-NEXT:    rldicl 6, 6, 61, 2
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    rldicl 6, 4, 2, 62
+; LE-NEXT:    rldicr 4, 4, 0, 0
+; LE-NEXT:    rldicl 6, 6, 62, 1
+; LE-NEXT:    mulld 6, 3, 6
+; LE-NEXT:    mulld 3, 3, 4
+; LE-NEXT:    xor 5, 5, 6
+; LE-NEXT:    xor 3, 5, 3
+; LE-NEXT:    mtfprd 1, 3
+; LE-NEXT:    xxmrghd 34, 0, 1
+; LE-NEXT:    addi 1, 1, 480
 ; LE-NEXT:    blr
   %res = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %a, <2 x i64> %b)
   ret <2 x i64> %res
@@ -5227,851 +5127,840 @@ define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; LE-LABEL: clmulr_v2i64:
 ; LE:       # %bb.0:
-; LE-NEXT:    lis 3, -21846
-; LE-NEXT:    lis 4, 21845
-; LE-NEXT:    lis 5, -13108
-; LE-NEXT:    lis 6, 13107
-; LE-NEXT:    xxswapd 0, 34
-; LE-NEXT:    lis 7, -3856
-; LE-NEXT:    lis 8, 3855
+; LE-NEXT:    stdu 1, -752(1)
+; LE-NEXT:    lis 4, -21846
+; LE-NEXT:    lis 5, 21845
 ; LE-NEXT:    xxswapd 1, 35
-; LE-NEXT:    ori 3, 3, 43690
-; LE-NEXT:    ori 4, 4, 21845
-; LE-NEXT:    ori 5, 5, 52428
-; LE-NEXT:    ori 6, 6, 13107
-; LE-NEXT:    mffprd 9, 0
-; LE-NEXT:    ori 7, 7, 61680
-; LE-NEXT:    ori 8, 8, 3855
-; LE-NEXT:    mffprd 10, 1
-; LE-NEXT:    sldi 3, 3, 32
+; LE-NEXT:    xxswapd 0, 34
+; LE-NEXT:    mfvsrd 3, 35
+; LE-NEXT:    mfvsrd 9, 34
+; LE-NEXT:    lis 6, -13108
+; LE-NEXT:    lis 7, 13107
+; LE-NEXT:    ori 4, 4, 43690
+; LE-NEXT:    ori 5, 5, 21845
+; LE-NEXT:    mffprd 8, 1
+; LE-NEXT:    mffprd 10, 0
+; LE-NEXT:    std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 728(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 6, 6, 52428
+; LE-NEXT:    ori 7, 7, 13107
 ; LE-NEXT:    sldi 4, 4, 32
 ; LE-NEXT:    sldi 5, 5, 32
 ; LE-NEXT:    sldi 6, 6, 32
 ; LE-NEXT:    sldi 7, 7, 32
-; LE-NEXT:    sldi 8, 8, 32
-; LE-NEXT:    vspltisw 4, 4
-; LE-NEXT:    vspltisw 0, 8
-; LE-NEXT:    oris 3, 3, 43690
-; LE-NEXT:    oris 4, 4, 21845
-; LE-NEXT:    vspltisw 7, 1
-; LE-NEXT:    vspltisw 10, 7
-; LE-NEXT:    oris 5, 5, 52428
-; LE-NEXT:    oris 11, 6, 13107
-; LE-NEXT:    vupklsw 1, 4
-; LE-NEXT:    vspltisw 4, 3
-; LE-NEXT:    oris 12, 7, 61680
-; LE-NEXT:    oris 0, 8, 3855
-; LE-NEXT:    vupklsw 6, 0
-; LE-NEXT:    vupklsw 10, 10
-; LE-NEXT:    ori 8, 3, 43690
-; LE-NEXT:    ori 7, 4, 21845
-; LE-NEXT:    vupklsw 8, 4
-; LE-NEXT:    xxlxor 36, 36, 36
-; LE-NEXT:    ori 6, 5, 52428
-; LE-NEXT:    ori 5, 11, 13107
-; LE-NEXT:    sldi 11, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    ori 4, 12, 61680
+; LE-NEXT:    sldi 11, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    std 30, 736(1) # 8-byte Folded Spill
+; LE-NEXT:    lis 0, -3856
+; LE-NEXT:    oris 4, 4, 43690
+; LE-NEXT:    oris 5, 5, 21845
+; LE-NEXT:    lis 30, 3855
+; LE-NEXT:    oris 6, 6, 52428
 ; LE-NEXT:    sldi 12, 10, 1
 ; LE-NEXT:    rldicl 10, 10, 63, 1
-; LE-NEXT:    and 11, 11, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    and 12, 12, 8
-; LE-NEXT:    and 10, 10, 7
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    ori 3, 0, 3855
-; LE-NEXT:    sldi 11, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    sldi 12, 10, 2
-; LE-NEXT:    rldicl 10, 10, 62, 2
-; LE-NEXT:    and 11, 11, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    and 12, 12, 6
-; LE-NEXT:    and 10, 10, 5
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    sldi 11, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    sldi 12, 10, 4
-; LE-NEXT:    rldicl 10, 10, 60, 4
-; LE-NEXT:    and 11, 11, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    and 12, 12, 4
-; LE-NEXT:    and 10, 10, 3
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    rotlwi 12, 9, 24
-; LE-NEXT:    rldicl 11, 9, 32, 32
-; LE-NEXT:    rldicl 0, 10, 32, 32
-; LE-NEXT:    rlwimi 12, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 12, 9, 8, 24, 31
-; LE-NEXT:    rotlwi 9, 10, 24
-; LE-NEXT:    sldi 12, 12, 32
-; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 11, 24
-; LE-NEXT:    sldi 9, 9, 32
-; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
-; LE-NEXT:    rotlwi 11, 0, 24
-; LE-NEXT:    or 10, 12, 10
-; LE-NEXT:    rlwimi 11, 0, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 0, 8, 24, 31
-; LE-NEXT:    mtfprd 0, 10
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    mtfprd 1, 9
-; LE-NEXT:    mfvsrd 9, 35
-; LE-NEXT:    sldi 10, 9, 1
+; LE-NEXT:    oris 7, 7, 13107
+; LE-NEXT:    std 27, 712(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 28, 4, 43690
+; LE-NEXT:    ori 29, 5, 21845
+; LE-NEXT:    std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 616(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 4, 8, 1
+; LE-NEXT:    rldicl 5, 8, 63, 1
+; LE-NEXT:    std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 632(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 8, 9, 1
 ; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    mfvsrd 9, 34
-; LE-NEXT:    sldi 10, 10, 32
-; LE-NEXT:    or 10, 10, 11
-; LE-NEXT:    mtfprd 2, 10
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    sldi 9, 10, 32
-; LE-NEXT:    addis 10, 2, .LCPI7_14 at toc@ha
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    vspltisw 3, 2
-; LE-NEXT:    vupklsw 5, 3
-; LE-NEXT:    xxmrghd 34, 2, 1
-; LE-NEXT:    xxland 41, 34, 37
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    mtfprd 1, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_0 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_0 at toc@l
-; LE-NEXT:    xxmrghd 35, 1, 0
-; LE-NEXT:    vaddudm 0, 3, 3
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    xxland 0, 41, 32
-; LE-NEXT:    vupklsw 0, 7
-; LE-NEXT:    xxland 39, 34, 33
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxland 32, 34, 32
-; LE-NEXT:    xxland 1, 39, 37
-; LE-NEXT:    xxland 39, 34, 38
-; LE-NEXT:    vsld 5, 3, 8
-; LE-NEXT:    vsld 6, 3, 6
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxland 13, 32, 35
-; LE-NEXT:    xxland 2, 39, 37
-; LE-NEXT:    vsld 5, 3, 1
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_1 at toc@ha
-; LE-NEXT:    xxlxor 0, 13, 0
-; LE-NEXT:    vspltisw 8, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_1 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    xxland 41, 34, 33
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxland 5, 41, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_2 at toc@ha
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 8, 3, 8
-; LE-NEXT:    addi 9, 9, .LCPI7_2 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 5
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_3 at toc@ha
-; LE-NEXT:    xxland 41, 34, 37
-; LE-NEXT:    vspltisw 7, 6
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_3 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    lxvd2x 6, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_4 at toc@ha
-; LE-NEXT:    xxland 4, 41, 40
-; LE-NEXT:    xxland 40, 34, 3
-; LE-NEXT:    vupklsw 7, 7
-; LE-NEXT:    vsld 7, 3, 7
-; LE-NEXT:    addi 9, 9, .LCPI7_4 at toc@l
-; LE-NEXT:    vcmpgtud 8, 8, 4
-; LE-NEXT:    xxlxor 0, 0, 4
-; LE-NEXT:    lxvd2x 7, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_5 at toc@ha
-; LE-NEXT:    xxland 41, 34, 6
-; LE-NEXT:    xxland 3, 40, 39
-; LE-NEXT:    vsld 7, 3, 10
-; LE-NEXT:    addi 9, 9, .LCPI7_5 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    lxvd2x 8, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_6 at toc@ha
-; LE-NEXT:    xxland 6, 41, 39
-; LE-NEXT:    xxland 41, 34, 7
-; LE-NEXT:    addi 9, 9, .LCPI7_6 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 6
-; LE-NEXT:    xxland 7, 41, 38
-; LE-NEXT:    xxland 41, 34, 8
-; LE-NEXT:    lxvd2x 8, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_7 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_7 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 7
-; LE-NEXT:    lxvd2x 10, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_8 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_8 at toc@l
-; LE-NEXT:    lxvd2x 11, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_9 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_9 at toc@l
-; LE-NEXT:    lxvd2x 12, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_10 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_10 at toc@l
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_11 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_11 at toc@l
-; LE-NEXT:    lxvd2x 2, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_12 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_12 at toc@l
-; LE-NEXT:    lxvd2x 4, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_13 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_13 at toc@l
-; LE-NEXT:    vspltisw 8, 9
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vspltisw 8, 10
-; LE-NEXT:    vsld 6, 3, 7
-; LE-NEXT:    xxland 9, 41, 38
-; LE-NEXT:    xxland 41, 34, 10
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 9
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vspltisw 8, 11
-; LE-NEXT:    vsld 6, 3, 7
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    xxland 40, 34, 8
-; LE-NEXT:    vcmpgtud 8, 8, 4
-; LE-NEXT:    xxland 8, 40, 38
-; LE-NEXT:    vsld 6, 3, 7
-; LE-NEXT:    xxland 10, 41, 38
-; LE-NEXT:    xxland 41, 34, 11
-; LE-NEXT:    vcmpgtud 9, 9, 4
-; LE-NEXT:    xxlxor 0, 0, 8
-; LE-NEXT:    xxlxor 0, 0, 10
-; LE-NEXT:    vspltisw 7, 12
-; LE-NEXT:    vspltisw 8, 13
-; LE-NEXT:    vupklsw 6, 7
-; LE-NEXT:    vsld 6, 3, 6
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    xxland 11, 41, 38
-; LE-NEXT:    xxland 38, 34, 12
-; LE-NEXT:    vspltisw 8, 14
-; LE-NEXT:    vsld 7, 3, 7
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    xxland 12, 38, 39
-; LE-NEXT:    xxland 39, 34, 1
-; LE-NEXT:    vcmpgtud 7, 7, 4
-; LE-NEXT:    xxlxor 0, 0, 11
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 6, 3, 8
-; LE-NEXT:    xxland 1, 39, 38
-; LE-NEXT:    xxland 38, 34, 2
-; LE-NEXT:    vspltisw 9, 15
-; LE-NEXT:    xxlxor 0, 0, 12
-; LE-NEXT:    vcmpgtud 6, 6, 4
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    vupklsw 0, 9
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 2, 38, 32
-; LE-NEXT:    vsld 0, 3, 1
-; LE-NEXT:    xxland 33, 34, 4
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    xxland 4, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_14 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_16 at toc@ha
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_15 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 4
-; LE-NEXT:    addi 9, 9, .LCPI7_15 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 3
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 3, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_16 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_18 at toc@ha
-; LE-NEXT:    lxvd2x 5, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_17 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    addi 9, 9, .LCPI7_17 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 5
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 5, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_18 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_20 at toc@ha
-; LE-NEXT:    lxvd2x 6, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_19 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_19 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 6
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 6, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_20 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_22 at toc@ha
-; LE-NEXT:    lxvd2x 7, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_21 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 6
-; LE-NEXT:    addi 9, 9, .LCPI7_21 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 7
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 7, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_22 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_24 at toc@ha
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_23 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 7
-; LE-NEXT:    addi 9, 9, .LCPI7_23 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_24 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_26 at toc@ha
-; LE-NEXT:    lxvd2x 2, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_25 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    addi 9, 9, .LCPI7_25 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 2
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 2, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_26 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI7_28 at toc@ha
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_27 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    addi 9, 9, .LCPI7_27 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 3
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 3, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI7_28 at toc@l
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_29 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    addi 9, 9, .LCPI7_29 at toc@l
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_30 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_30 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_31 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_31 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_32 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_32 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_33 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_33 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_34 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_34 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_35 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_35 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_36 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_36 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_37 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_37 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_38 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_38 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_39 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_39 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_40 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_40 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_41 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_41 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_42 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_42 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_43 at toc@ha
-; LE-NEXT:    vsld 0, 3, 0
-; LE-NEXT:    addi 9, 9, .LCPI7_43 at toc@l
-; LE-NEXT:    xxland 33, 34, 1
-; LE-NEXT:    vcmpgtud 1, 1, 4
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_44 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_44 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_45 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_45 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_46 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_46 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_47 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_47 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_48 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_48 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_49 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_49 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_50 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_50 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_51 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_51 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_52 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_52 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_53 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_53 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_54 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_54 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_55 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_55 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_56 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_56 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_57 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_57 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_58 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_58 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_59 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_59 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_60 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_60 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_61 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_61 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_62 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_62 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_63 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_63 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_64 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_64 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_65 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_65 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_66 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_66 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_67 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_67 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_68 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_68 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_69 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_69 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_70 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_70 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_71 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_71 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_72 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_72 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_73 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_73 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_74 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_74 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_75 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_75 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_76 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_76 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_77 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_77 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_78 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_78 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_79 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_79 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_80 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_80 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_81 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_81 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_82 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_82 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_83 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_83 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_84 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_84 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_85 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_85 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_86 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_86 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_87 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_87 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_88 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_88 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_89 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_89 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_90 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_90 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_91 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_91 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_92 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_92 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_93 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_93 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_94 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_94 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_95 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_95 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_96 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_96 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_97 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_97 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_98 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_98 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_99 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_99 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_100 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_100 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_101 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_101 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_102 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_102 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    lxvd2x 37, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_103 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI7_103 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI7_104 at toc@ha
-; LE-NEXT:    vsld 5, 3, 5
-; LE-NEXT:    addi 9, 9, .LCPI7_104 at toc@l
-; LE-NEXT:    xxland 32, 34, 1
-; LE-NEXT:    vcmpgtud 0, 0, 4
-; LE-NEXT:    xxland 1, 32, 37
-; LE-NEXT:    xxleqv 37, 37, 37
-; LE-NEXT:    vsld 3, 3, 5
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    xxland 34, 34, 1
-; LE-NEXT:    vcmpgtud 2, 2, 4
-; LE-NEXT:    xxland 1, 34, 35
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    mffprd 9, 0
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    sldi 9, 10, 32
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    xxswapd 1, 0
-; LE-NEXT:    mtfprd 0, 9
-; LE-NEXT:    mffprd 9, 1
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 8, 10, 8
-; LE-NEXT:    and 7, 9, 7
-; LE-NEXT:    or 7, 7, 8
-; LE-NEXT:    sldi 8, 7, 2
+; LE-NEXT:    std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 592(1) # 8-byte Folded Spill
+; LE-NEXT:    and 11, 11, 28
+; LE-NEXT:    and 3, 3, 29
+; LE-NEXT:    std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT:    std 19, 648(1) # 8-byte Folded Spill
+; LE-NEXT:    and 4, 4, 28
+; LE-NEXT:    and 5, 5, 29
+; LE-NEXT:    std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT:    std 21, 664(1) # 8-byte Folded Spill
+; LE-NEXT:    and 8, 8, 28
+; LE-NEXT:    and 9, 9, 29
+; LE-NEXT:    std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT:    std 23, 680(1) # 8-byte Folded Spill
+; LE-NEXT:    and 12, 12, 28
+; LE-NEXT:    and 10, 10, 29
+; LE-NEXT:    std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 696(1) # 8-byte Folded Spill
+; LE-NEXT:    or 3, 3, 11
+; LE-NEXT:    or 4, 5, 4
+; LE-NEXT:    std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT:    std 31, 744(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 5, 0, 61680
+; LE-NEXT:    ori 11, 30, 3855
+; LE-NEXT:    std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 30, 6, 52428
+; LE-NEXT:    ori 0, 7, 13107
+; LE-NEXT:    std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 576(1) # 8-byte Folded Spill
+; LE-NEXT:    or 6, 9, 8
+; LE-NEXT:    or 7, 10, 12
+; LE-NEXT:    sldi 8, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    sldi 9, 4, 2
+; LE-NEXT:    rldicl 4, 4, 62, 2
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 10, 11, 32
+; LE-NEXT:    sldi 11, 6, 2
+; LE-NEXT:    rldicl 6, 6, 62, 2
+; LE-NEXT:    sldi 12, 7, 2
 ; LE-NEXT:    rldicl 7, 7, 62, 2
-; LE-NEXT:    and 6, 8, 6
-; LE-NEXT:    and 5, 7, 5
-; LE-NEXT:    or 5, 5, 6
-; LE-NEXT:    sldi 6, 5, 4
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    oris 5, 5, 61680
+; LE-NEXT:    oris 10, 10, 3855
+; LE-NEXT:    and 11, 11, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    and 12, 12, 30
+; LE-NEXT:    and 7, 7, 0
+; LE-NEXT:    or 3, 3, 8
+; LE-NEXT:    or 4, 4, 9
+; LE-NEXT:    ori 30, 5, 61680
+; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 0, 10, 3855
+; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT:    or 5, 6, 11
+; LE-NEXT:    or 6, 7, 12
+; LE-NEXT:    sldi 7, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    sldi 8, 4, 4
+; LE-NEXT:    rldicl 4, 4, 60, 4
+; LE-NEXT:    sldi 9, 5, 4
 ; LE-NEXT:    rldicl 5, 5, 60, 4
-; LE-NEXT:    and 4, 6, 4
-; LE-NEXT:    and 3, 5, 3
+; LE-NEXT:    sldi 10, 6, 4
+; LE-NEXT:    rldicl 6, 6, 60, 4
+; LE-NEXT:    and 7, 7, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 5, 5, 0
+; LE-NEXT:    and 10, 10, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    or 3, 3, 7
+; LE-NEXT:    or 4, 4, 8
+; LE-NEXT:    or 5, 5, 9
+; LE-NEXT:    or 6, 6, 10
+; LE-NEXT:    rldicl 7, 3, 32, 32
+; LE-NEXT:    rotlwi 8, 3, 24
+; LE-NEXT:    rldicl 9, 4, 32, 32
+; LE-NEXT:    rotlwi 10, 4, 24
+; LE-NEXT:    rldicl 11, 5, 32, 32
+; LE-NEXT:    rotlwi 12, 5, 24
+; LE-NEXT:    rotlwi 29, 7, 24
+; LE-NEXT:    rlwimi 8, 3, 8, 8, 15
+; LE-NEXT:    rotlwi 28, 9, 24
+; LE-NEXT:    rlwimi 10, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 8, 3, 8, 24, 31
+; LE-NEXT:    rlwimi 10, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 11, 24
+; LE-NEXT:    rlwimi 12, 5, 8, 8, 15
+; LE-NEXT:    rlwimi 29, 7, 8, 8, 15
+; LE-NEXT:    sldi 3, 8, 32
+; LE-NEXT:    rlwimi 28, 9, 8, 8, 15
+; LE-NEXT:    sldi 8, 10, 32
+; LE-NEXT:    rlwimi 12, 5, 8, 24, 31
+; LE-NEXT:    rlwimi 29, 7, 8, 24, 31
+; LE-NEXT:    rlwimi 28, 9, 8, 24, 31
+; LE-NEXT:    rlwimi 4, 11, 8, 8, 15
+; LE-NEXT:    sldi 5, 12, 32
+; LE-NEXT:    or 9, 3, 29
+; LE-NEXT:    or 3, 8, 28
+; LE-NEXT:    rlwimi 4, 11, 8, 24, 31
+; LE-NEXT:    or 10, 5, 4
+; LE-NEXT:    rlwinm 4, 3, 0, 30, 30
+; LE-NEXT:    std 4, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 5, 5
+; LE-NEXT:    std 4, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 4, 4
+; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 3, 3
+; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 2, 2
+; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 1, 1
+; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 0, 0
+; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 31, 33
+; LE-NEXT:    std 4, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 30, 34
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 29, 35
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 28, 36
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 27, 37
+; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 26, 38
+; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 25, 39
+; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 24, 40
+; LE-NEXT:    rldicl 0, 6, 32, 32
+; LE-NEXT:    rotlwi 30, 6, 24
+; LE-NEXT:    rotlwi 27, 0, 24
+; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 23, 41
+; LE-NEXT:    rlwimi 30, 6, 8, 8, 15
+; LE-NEXT:    rlwimi 30, 6, 8, 24, 31
+; LE-NEXT:    rlwimi 27, 0, 8, 8, 15
+; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 22, 42
+; LE-NEXT:    sldi 6, 30, 32
+; LE-NEXT:    rlwimi 27, 0, 8, 24, 31
+; LE-NEXT:    or 11, 6, 27
+; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 21, 43
+; LE-NEXT:    clrldi 5, 3, 63
+; LE-NEXT:    rlwinm 6, 3, 0, 29, 29
+; LE-NEXT:    rlwinm 7, 3, 0, 28, 28
+; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 20, 44
+; LE-NEXT:    rlwinm 8, 3, 0, 27, 27
+; LE-NEXT:    rlwinm 12, 3, 0, 26, 26
+; LE-NEXT:    rlwinm 0, 3, 0, 25, 25
+; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 19, 45
+; LE-NEXT:    rlwinm 30, 3, 0, 24, 24
+; LE-NEXT:    rlwinm 29, 3, 0, 23, 23
+; LE-NEXT:    rlwinm 28, 3, 0, 22, 22
+; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 18, 46
+; LE-NEXT:    rlwinm 27, 3, 0, 21, 21
+; LE-NEXT:    rlwinm 26, 3, 0, 20, 20
+; LE-NEXT:    rlwinm 25, 3, 0, 19, 19
+; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 17, 47
+; LE-NEXT:    rlwinm 24, 3, 0, 18, 18
+; LE-NEXT:    rlwinm 23, 3, 0, 17, 17
+; LE-NEXT:    rlwinm 22, 3, 0, 16, 16
+; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 16, 48
+; LE-NEXT:    rlwinm 21, 3, 0, 15, 15
+; LE-NEXT:    rlwinm 20, 3, 0, 14, 14
+; LE-NEXT:    rlwinm 19, 3, 0, 13, 13
+; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 15, 49
+; LE-NEXT:    rlwinm 18, 3, 0, 12, 12
+; LE-NEXT:    rlwinm 17, 3, 0, 11, 11
+; LE-NEXT:    rlwinm 16, 3, 0, 10, 10
+; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 14, 50
+; LE-NEXT:    rlwinm 15, 3, 0, 9, 9
+; LE-NEXT:    rlwinm 14, 3, 0, 8, 8
+; LE-NEXT:    rlwinm 31, 3, 0, 7, 7
+; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 13, 51
+; LE-NEXT:    rlwinm 2, 3, 0, 6, 6
+; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 12, 52
+; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 11, 53
+; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 10, 54
+; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 9, 55
+; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 8, 56
+; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 7, 57
+; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 6, 58
+; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 5, 59
+; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 4, 60
+; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 3, 61
+; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 2, 62
+; LE-NEXT:    rldicr 3, 3, 0, 0
+; LE-NEXT:    std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 7
+; LE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 8
+; LE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 12
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 30
+; LE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 29
+; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 28
+; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 27
+; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 26
+; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 25
+; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 24
+; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 23
+; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 22
+; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 21
+; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 20
+; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 19
+; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 18
+; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 17
+; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 16
+; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 15
+; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 14
+; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 31
+; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 2
+; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 4, 3, 32, 31
+; LE-NEXT:    ld 3, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 5, 3, 33, 30
+; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 6, 3, 34, 29
+; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 7, 3, 35, 28
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 8, 3, 36, 27
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 12, 3, 37, 26
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 0, 3, 38, 25
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 30, 3, 39, 24
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 29, 3, 40, 23
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 28, 3, 41, 22
+; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 27, 3, 42, 21
+; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 26, 3, 43, 20
+; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 25, 3, 44, 19
+; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 24, 3, 45, 18
+; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 23, 3, 46, 17
+; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 22, 3, 47, 16
+; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 21, 3, 48, 15
+; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 20, 3, 49, 14
+; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 19, 3, 50, 13
+; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 18, 3, 51, 12
+; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 17, 3, 52, 11
+; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 16, 3, 53, 10
+; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 15, 3, 54, 9
+; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 14, 3, 55, 8
+; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 31, 3, 56, 7
+; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 2, 3, 57, 6
+; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 58, 5
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 59, 4
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 60, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 61, 2
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 62, 1
+; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 4
+; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    mulld 6, 11, 7
+; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    mulld 8, 11, 12
+; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 2, 11, 3
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 11, 11, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 30, 30
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 5, 4
+; LE-NEXT:    rlwinm 5, 9, 0, 29, 29
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 6
+; LE-NEXT:    ld 6, 592(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 7
+; LE-NEXT:    ld 7, 584(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 8
+; LE-NEXT:    ld 8, 576(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 12
+; LE-NEXT:    ld 12, 560(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 0
+; LE-NEXT:    ld 0, 552(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 30
+; LE-NEXT:    ld 30, 736(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 29
+; LE-NEXT:    ld 29, 728(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 28
+; LE-NEXT:    ld 28, 720(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 27
+; LE-NEXT:    ld 27, 712(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 26
+; LE-NEXT:    ld 26, 704(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 25
+; LE-NEXT:    ld 25, 696(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 24
+; LE-NEXT:    ld 24, 688(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 23
+; LE-NEXT:    ld 23, 680(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 22
+; LE-NEXT:    ld 22, 672(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 21
+; LE-NEXT:    ld 21, 664(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 20
+; LE-NEXT:    ld 20, 656(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 19
+; LE-NEXT:    ld 19, 648(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 18
+; LE-NEXT:    ld 18, 640(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 17
+; LE-NEXT:    ld 17, 632(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 16
+; LE-NEXT:    ld 16, 624(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 15
+; LE-NEXT:    ld 15, 616(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 14
+; LE-NEXT:    ld 14, 608(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 31
+; LE-NEXT:    ld 31, 744(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 2
+; LE-NEXT:    ld 2, 600(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 11
+; LE-NEXT:    ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 5, 3, 24
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; LE-NEXT:    rotlwi 3, 4, 24
+; LE-NEXT:    rlwimi 3, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 3, 4, 8, 24, 31
+; LE-NEXT:    sldi 4, 5, 32
+; LE-NEXT:    or 3, 4, 3
+; LE-NEXT:    ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 24, 24
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    rlwinm 4, 9, 0, 23, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 22, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 21, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 20, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 19, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 18, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 17, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 16, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 15, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 14, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 13, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 12, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 11, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 10, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 9, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 8, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 7, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 6, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 5, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 32, 32
+; LE-NEXT:    rldicl 4, 4, 32, 31
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 31, 33
+; LE-NEXT:    rldicl 4, 4, 33, 30
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 30, 34
+; LE-NEXT:    rldicl 4, 4, 34, 29
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 29, 35
+; LE-NEXT:    rldicl 4, 4, 35, 28
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 28, 36
+; LE-NEXT:    rldicl 4, 4, 36, 27
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 27, 37
+; LE-NEXT:    rldicl 4, 4, 37, 26
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 26, 38
+; LE-NEXT:    rldicl 4, 4, 38, 25
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 25, 39
+; LE-NEXT:    rldicl 4, 4, 39, 24
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 24, 40
+; LE-NEXT:    rldicl 4, 4, 40, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 23, 41
+; LE-NEXT:    rldicl 4, 4, 41, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 22, 42
+; LE-NEXT:    rldicl 4, 4, 42, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 21, 43
+; LE-NEXT:    rldicl 4, 4, 43, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 20, 44
+; LE-NEXT:    rldicl 4, 4, 44, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 19, 45
+; LE-NEXT:    rldicl 4, 4, 45, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 18, 46
+; LE-NEXT:    rldicl 4, 4, 46, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 17, 47
+; LE-NEXT:    rldicl 4, 4, 47, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 16, 48
+; LE-NEXT:    rldicl 4, 4, 48, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 15, 49
+; LE-NEXT:    rldicl 4, 4, 49, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 14, 50
+; LE-NEXT:    rldicl 4, 4, 50, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 13, 51
+; LE-NEXT:    rldicl 4, 4, 51, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 12, 52
+; LE-NEXT:    rldicl 4, 4, 52, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 11, 53
+; LE-NEXT:    rldicl 4, 4, 53, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 10, 54
+; LE-NEXT:    rldicl 4, 4, 54, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 9, 55
+; LE-NEXT:    rldicl 4, 4, 55, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 8, 56
+; LE-NEXT:    rldicl 4, 4, 56, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 7, 57
+; LE-NEXT:    rldicl 4, 4, 57, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 6, 58
+; LE-NEXT:    rldicl 4, 4, 58, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 5, 59
+; LE-NEXT:    rldicl 4, 4, 59, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 4, 60
+; LE-NEXT:    rldicl 4, 4, 60, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 3, 61
+; LE-NEXT:    rldicl 4, 4, 61, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 2, 62
+; LE-NEXT:    rldicl 4, 4, 62, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicr 4, 9, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
 ; LE-NEXT:    or 3, 3, 4
 ; LE-NEXT:    rldicl 4, 3, 32, 32
 ; LE-NEXT:    rotlwi 5, 4, 24
@@ -6083,7 +5972,8 @@ define <2 x i64> @clmulr_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    sldi 3, 4, 32
 ; LE-NEXT:    or 3, 3, 5
 ; LE-NEXT:    mtfprd 1, 3
-; LE-NEXT:    xxmrghd 34, 0, 1
+; LE-NEXT:    xxmrghd 34, 1, 0
+; LE-NEXT:    addi 1, 1, 752
 ; LE-NEXT:    blr
   %a.ext = zext <2 x i64> %a to <2 x i128>
   %b.ext = zext <2 x i64> %b to <2 x i128>
@@ -8889,851 +8779,842 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; LE-LABEL: clmulh_v2i64:
 ; LE:       # %bb.0:
-; LE-NEXT:    lis 3, -21846
-; LE-NEXT:    lis 4, 21845
-; LE-NEXT:    lis 5, -13108
-; LE-NEXT:    lis 6, 13107
-; LE-NEXT:    xxswapd 0, 34
-; LE-NEXT:    lis 7, -3856
-; LE-NEXT:    lis 8, 3855
+; LE-NEXT:    stdu 1, -752(1)
+; LE-NEXT:    lis 4, -21846
+; LE-NEXT:    lis 5, 21845
 ; LE-NEXT:    xxswapd 1, 35
-; LE-NEXT:    ori 3, 3, 43690
-; LE-NEXT:    ori 4, 4, 21845
-; LE-NEXT:    ori 5, 5, 52428
-; LE-NEXT:    ori 6, 6, 13107
-; LE-NEXT:    mffprd 9, 0
-; LE-NEXT:    ori 7, 7, 61680
-; LE-NEXT:    ori 8, 8, 3855
-; LE-NEXT:    mffprd 10, 1
-; LE-NEXT:    sldi 3, 3, 32
+; LE-NEXT:    xxswapd 0, 34
+; LE-NEXT:    mfvsrd 3, 35
+; LE-NEXT:    mfvsrd 9, 34
+; LE-NEXT:    lis 6, -13108
+; LE-NEXT:    lis 7, 13107
+; LE-NEXT:    ori 4, 4, 43690
+; LE-NEXT:    ori 5, 5, 21845
+; LE-NEXT:    mffprd 8, 1
+; LE-NEXT:    mffprd 10, 0
+; LE-NEXT:    std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 728(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 6, 6, 52428
+; LE-NEXT:    ori 7, 7, 13107
 ; LE-NEXT:    sldi 4, 4, 32
 ; LE-NEXT:    sldi 5, 5, 32
 ; LE-NEXT:    sldi 6, 6, 32
 ; LE-NEXT:    sldi 7, 7, 32
-; LE-NEXT:    sldi 8, 8, 32
-; LE-NEXT:    vspltisw 4, 2
-; LE-NEXT:    vspltisw 5, 8
-; LE-NEXT:    oris 3, 3, 43690
-; LE-NEXT:    oris 4, 4, 21845
-; LE-NEXT:    vspltisw 8, 1
-; LE-NEXT:    vspltisw 10, 7
-; LE-NEXT:    oris 5, 5, 52428
-; LE-NEXT:    oris 11, 6, 13107
-; LE-NEXT:    vupklsw 0, 4
-; LE-NEXT:    vupklsw 6, 5
-; LE-NEXT:    oris 12, 7, 61680
-; LE-NEXT:    oris 0, 8, 3855
-; LE-NEXT:    xxlxor 37, 37, 37
-; LE-NEXT:    vupklsw 10, 10
-; LE-NEXT:    ori 8, 3, 43690
-; LE-NEXT:    ori 7, 4, 21845
-; LE-NEXT:    ori 6, 5, 52428
-; LE-NEXT:    ori 5, 11, 13107
-; LE-NEXT:    sldi 11, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    ori 4, 12, 61680
+; LE-NEXT:    sldi 11, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    std 30, 736(1) # 8-byte Folded Spill
+; LE-NEXT:    lis 0, -3856
+; LE-NEXT:    oris 4, 4, 43690
+; LE-NEXT:    oris 5, 5, 21845
+; LE-NEXT:    lis 30, 3855
+; LE-NEXT:    oris 6, 6, 52428
 ; LE-NEXT:    sldi 12, 10, 1
 ; LE-NEXT:    rldicl 10, 10, 63, 1
-; LE-NEXT:    and 11, 11, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    and 12, 12, 8
-; LE-NEXT:    and 10, 10, 7
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    ori 3, 0, 3855
-; LE-NEXT:    sldi 11, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    sldi 12, 10, 2
-; LE-NEXT:    rldicl 10, 10, 62, 2
-; LE-NEXT:    and 11, 11, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    and 12, 12, 6
-; LE-NEXT:    and 10, 10, 5
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    sldi 11, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    sldi 12, 10, 4
-; LE-NEXT:    rldicl 10, 10, 60, 4
-; LE-NEXT:    and 11, 11, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    and 12, 12, 4
-; LE-NEXT:    and 10, 10, 3
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    or 10, 10, 12
-; LE-NEXT:    rotlwi 12, 9, 24
-; LE-NEXT:    rldicl 11, 9, 32, 32
-; LE-NEXT:    rldicl 0, 10, 32, 32
-; LE-NEXT:    rlwimi 12, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 12, 9, 8, 24, 31
-; LE-NEXT:    rotlwi 9, 10, 24
-; LE-NEXT:    sldi 12, 12, 32
-; LE-NEXT:    rlwimi 9, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 9, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 11, 24
-; LE-NEXT:    sldi 9, 9, 32
-; LE-NEXT:    rlwimi 10, 11, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 11, 8, 24, 31
-; LE-NEXT:    rotlwi 11, 0, 24
-; LE-NEXT:    or 10, 12, 10
-; LE-NEXT:    rlwimi 11, 0, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 0, 8, 24, 31
-; LE-NEXT:    mtfprd 0, 10
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    mtfprd 1, 9
-; LE-NEXT:    mfvsrd 9, 35
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    mfvsrd 9, 34
-; LE-NEXT:    sldi 10, 10, 32
-; LE-NEXT:    or 10, 10, 11
-; LE-NEXT:    mtfprd 2, 10
-; LE-NEXT:    sldi 10, 9, 1
+; LE-NEXT:    oris 7, 7, 13107
+; LE-NEXT:    std 27, 712(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 28, 4, 43690
+; LE-NEXT:    ori 29, 5, 21845
+; LE-NEXT:    std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 616(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 4, 8, 1
+; LE-NEXT:    rldicl 5, 8, 63, 1
+; LE-NEXT:    std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 632(1) # 8-byte Folded Spill
+; LE-NEXT:    sldi 8, 9, 1
 ; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    sldi 9, 10, 32
-; LE-NEXT:    addis 10, 2, .LCPI11_14 at toc@ha
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    vspltisw 2, 4
-; LE-NEXT:    vupklsw 1, 2
-; LE-NEXT:    vspltisw 2, 3
-; LE-NEXT:    vupklsw 9, 2
-; LE-NEXT:    xxmrghd 35, 2, 1
-; LE-NEXT:    xxland 34, 35, 32
-; LE-NEXT:    vcmpgtud 2, 2, 5
-; LE-NEXT:    mtfprd 1, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_0 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_0 at toc@l
-; LE-NEXT:    xxmrghd 36, 1, 0
-; LE-NEXT:    vaddudm 7, 4, 4
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    xxland 0, 34, 39
-; LE-NEXT:    xxland 39, 35, 33
-; LE-NEXT:    vcmpgtud 7, 7, 5
-; LE-NEXT:    xxland 1, 39, 32
-; LE-NEXT:    xxland 39, 35, 38
-; LE-NEXT:    vsld 0, 4, 9
-; LE-NEXT:    vsld 6, 4, 6
-; LE-NEXT:    vcmpgtud 7, 7, 5
-; LE-NEXT:    xxland 2, 39, 32
-; LE-NEXT:    vsld 0, 4, 1
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_1 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_1 at toc@l
-; LE-NEXT:    vupklsw 2, 8
-; LE-NEXT:    vspltisw 8, 5
-; LE-NEXT:    xxland 41, 35, 33
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    xxland 5, 41, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_2 at toc@ha
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 8, 4, 8
-; LE-NEXT:    addi 9, 9, .LCPI11_2 at toc@l
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_3 at toc@ha
-; LE-NEXT:    xxland 41, 35, 32
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_3 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    lxvd2x 6, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_4 at toc@ha
-; LE-NEXT:    xxland 4, 41, 40
-; LE-NEXT:    xxland 40, 35, 3
-; LE-NEXT:    addi 9, 9, .LCPI11_4 at toc@l
-; LE-NEXT:    vcmpgtud 8, 8, 5
-; LE-NEXT:    lxvd2x 7, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_5 at toc@ha
-; LE-NEXT:    xxland 41, 35, 6
-; LE-NEXT:    addi 9, 9, .LCPI11_5 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    lxvd2x 8, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_6 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_6 at toc@l
-; LE-NEXT:    vspltisw 7, 6
-; LE-NEXT:    vupklsw 7, 7
-; LE-NEXT:    vsld 7, 4, 7
-; LE-NEXT:    xxland 3, 40, 39
-; LE-NEXT:    vsld 7, 4, 10
-; LE-NEXT:    xxland 6, 41, 39
-; LE-NEXT:    xxland 41, 35, 7
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    xxland 7, 41, 38
-; LE-NEXT:    xxland 41, 35, 8
-; LE-NEXT:    lxvd2x 8, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_7 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_7 at toc@l
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    lxvd2x 10, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_8 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_8 at toc@l
-; LE-NEXT:    lxvd2x 11, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_9 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_9 at toc@l
-; LE-NEXT:    lxvd2x 12, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_10 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_10 at toc@l
-; LE-NEXT:    vspltisw 8, 9
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vspltisw 8, 10
-; LE-NEXT:    vsld 6, 4, 7
-; LE-NEXT:    xxland 9, 41, 38
-; LE-NEXT:    xxland 41, 35, 10
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vspltisw 8, 11
-; LE-NEXT:    vsld 6, 4, 7
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    xxland 40, 35, 8
-; LE-NEXT:    vcmpgtud 8, 8, 5
-; LE-NEXT:    xxland 8, 40, 38
-; LE-NEXT:    vsld 6, 4, 7
-; LE-NEXT:    xxland 10, 41, 38
-; LE-NEXT:    xxland 41, 35, 11
-; LE-NEXT:    vcmpgtud 9, 9, 5
-; LE-NEXT:    vspltisw 7, 12
-; LE-NEXT:    vspltisw 8, 13
-; LE-NEXT:    vupklsw 6, 7
-; LE-NEXT:    vsld 6, 4, 6
-; LE-NEXT:    vupklsw 7, 8
-; LE-NEXT:    vsld 7, 4, 7
-; LE-NEXT:    vspltisw 8, 14
-; LE-NEXT:    xxland 11, 41, 38
-; LE-NEXT:    xxland 38, 35, 12
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 12, 38, 39
-; LE-NEXT:    xxland 38, 35, 34
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 13, 38, 36
-; LE-NEXT:    vupklsw 8, 8
-; LE-NEXT:    vsld 7, 4, 8
-; LE-NEXT:    xxlxor 0, 13, 0
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_11 at toc@ha
-; LE-NEXT:    vspltisw 9, 15
-; LE-NEXT:    addi 9, 9, .LCPI11_11 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    lxvd2x 2, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_12 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 5
-; LE-NEXT:    xxland 40, 35, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_12 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 4
-; LE-NEXT:    vcmpgtud 8, 8, 5
-; LE-NEXT:    lxvd2x 4, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_13 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    xxland 1, 40, 39
-; LE-NEXT:    xxland 39, 35, 2
-; LE-NEXT:    addi 9, 9, .LCPI11_13 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 6
-; LE-NEXT:    vcmpgtud 7, 7, 5
-; LE-NEXT:    xxlxor 0, 0, 7
-; LE-NEXT:    xxlxor 0, 0, 9
-; LE-NEXT:    xxlxor 0, 0, 8
-; LE-NEXT:    xxlxor 0, 0, 10
-; LE-NEXT:    xxlxor 0, 0, 11
-; LE-NEXT:    xxlxor 0, 0, 12
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    vupklsw 6, 9
-; LE-NEXT:    vsld 6, 4, 6
-; LE-NEXT:    xxland 2, 39, 38
-; LE-NEXT:    xxland 38, 35, 4
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 4, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_14 at toc@l
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    addis 9, 2, .LCPI11_15 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addis 10, 2, .LCPI11_16 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_15 at toc@l
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    xxland 3, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_16 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_18 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 4
-; LE-NEXT:    lxvd2x 5, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_17 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    addi 9, 9, .LCPI11_17 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 5
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 5, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_18 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_20 at toc@ha
-; LE-NEXT:    lxvd2x 6, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_19 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 5
-; LE-NEXT:    addi 9, 9, .LCPI11_19 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 6
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 6, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_20 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_22 at toc@ha
-; LE-NEXT:    lxvd2x 7, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_21 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 6
-; LE-NEXT:    addi 9, 9, .LCPI11_21 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 7
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 7, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_22 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_24 at toc@ha
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_23 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 7
-; LE-NEXT:    addi 9, 9, .LCPI11_23 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_24 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_26 at toc@ha
-; LE-NEXT:    lxvd2x 2, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_25 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_25 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 2
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 2, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_26 at toc@l
-; LE-NEXT:    addis 10, 2, .LCPI11_28 at toc@ha
-; LE-NEXT:    lxvd2x 3, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_27 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 2
-; LE-NEXT:    addi 9, 9, .LCPI11_27 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 3
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 3, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addi 9, 10, .LCPI11_28 at toc@l
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_29 at toc@ha
-; LE-NEXT:    xxlxor 0, 0, 3
-; LE-NEXT:    addi 9, 9, .LCPI11_29 at toc@l
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_30 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_30 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_31 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_31 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_32 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_32 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_33 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_33 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_34 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_34 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_35 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_35 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_36 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_36 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_37 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_37 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_38 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_38 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_39 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_39 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_40 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_40 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_41 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_41 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    lxvd2x 33, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_42 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_42 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_43 at toc@ha
-; LE-NEXT:    vsld 1, 4, 1
-; LE-NEXT:    addi 9, 9, .LCPI11_43 at toc@l
-; LE-NEXT:    xxland 38, 35, 1
-; LE-NEXT:    vcmpgtud 6, 6, 5
-; LE-NEXT:    xxland 1, 38, 33
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_44 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_44 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_45 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_45 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_46 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_46 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_47 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_47 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_48 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_48 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_49 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_49 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_50 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_50 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_51 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_51 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_52 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_52 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_53 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_53 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_54 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_54 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_55 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_55 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_56 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_56 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_57 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_57 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_58 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_58 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_59 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_59 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_60 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_60 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_61 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_61 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_62 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_62 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_63 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_63 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_64 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_64 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_65 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_65 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_66 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_66 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_67 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_67 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_68 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_68 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_69 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_69 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_70 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_70 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_71 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_71 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_72 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_72 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_73 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_73 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_74 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_74 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_75 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_75 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_76 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_76 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_77 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_77 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_78 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_78 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_79 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_79 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_80 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_80 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_81 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_81 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_82 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_82 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_83 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_83 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_84 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_84 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_85 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_85 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_86 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_86 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_87 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_87 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_88 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_88 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_89 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_89 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_90 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_90 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_91 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_91 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_92 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_92 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_93 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_93 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_94 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_94 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_95 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_95 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_96 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_96 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_97 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_97 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_98 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_98 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_99 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_99 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_100 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_100 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_101 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_101 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_102 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_102 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    lxvd2x 32, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_103 at toc@ha
-; LE-NEXT:    addi 9, 9, .LCPI11_103 at toc@l
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    addis 9, 2, .LCPI11_104 at toc@ha
-; LE-NEXT:    vsld 0, 4, 0
-; LE-NEXT:    addi 9, 9, .LCPI11_104 at toc@l
-; LE-NEXT:    xxland 33, 35, 1
-; LE-NEXT:    vcmpgtud 1, 1, 5
-; LE-NEXT:    xxland 1, 33, 32
-; LE-NEXT:    xxleqv 32, 32, 32
-; LE-NEXT:    vsld 4, 4, 0
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    lxvd2x 1, 0, 9
-; LE-NEXT:    xxland 35, 35, 1
-; LE-NEXT:    vcmpgtud 3, 3, 5
-; LE-NEXT:    xxland 1, 35, 36
-; LE-NEXT:    xxlxor 0, 0, 1
-; LE-NEXT:    mffprd 9, 0
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 10, 10, 8
-; LE-NEXT:    and 9, 9, 7
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 2
-; LE-NEXT:    rldicl 9, 9, 62, 2
-; LE-NEXT:    and 10, 10, 6
-; LE-NEXT:    and 9, 9, 5
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    sldi 10, 9, 4
-; LE-NEXT:    rldicl 9, 9, 60, 4
-; LE-NEXT:    and 10, 10, 4
-; LE-NEXT:    and 9, 9, 3
-; LE-NEXT:    or 9, 9, 10
-; LE-NEXT:    rldicl 10, 9, 32, 32
-; LE-NEXT:    rotlwi 11, 10, 24
-; LE-NEXT:    rlwimi 11, 10, 8, 8, 15
-; LE-NEXT:    rlwimi 11, 10, 8, 24, 31
-; LE-NEXT:    rotlwi 10, 9, 24
-; LE-NEXT:    rlwimi 10, 9, 8, 8, 15
-; LE-NEXT:    rlwimi 10, 9, 8, 24, 31
-; LE-NEXT:    sldi 9, 10, 32
-; LE-NEXT:    or 9, 9, 11
-; LE-NEXT:    xxswapd 1, 0
-; LE-NEXT:    mtfprd 0, 9
-; LE-NEXT:    mffprd 9, 1
-; LE-NEXT:    sldi 10, 9, 1
-; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    and 8, 10, 8
-; LE-NEXT:    and 7, 9, 7
-; LE-NEXT:    or 7, 7, 8
-; LE-NEXT:    sldi 8, 7, 2
+; LE-NEXT:    std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 592(1) # 8-byte Folded Spill
+; LE-NEXT:    and 11, 11, 28
+; LE-NEXT:    and 3, 3, 29
+; LE-NEXT:    std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT:    std 19, 648(1) # 8-byte Folded Spill
+; LE-NEXT:    and 4, 4, 28
+; LE-NEXT:    and 5, 5, 29
+; LE-NEXT:    std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT:    std 21, 664(1) # 8-byte Folded Spill
+; LE-NEXT:    and 8, 8, 28
+; LE-NEXT:    and 9, 9, 29
+; LE-NEXT:    std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT:    std 23, 680(1) # 8-byte Folded Spill
+; LE-NEXT:    and 12, 12, 28
+; LE-NEXT:    and 10, 10, 29
+; LE-NEXT:    std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 696(1) # 8-byte Folded Spill
+; LE-NEXT:    or 3, 3, 11
+; LE-NEXT:    or 4, 5, 4
+; LE-NEXT:    std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT:    std 31, 744(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 5, 0, 61680
+; LE-NEXT:    ori 11, 30, 3855
+; LE-NEXT:    std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    vspltisw 2, 1
+; LE-NEXT:    ori 30, 6, 52428
+; LE-NEXT:    ori 0, 7, 13107
+; LE-NEXT:    std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 576(1) # 8-byte Folded Spill
+; LE-NEXT:    or 6, 9, 8
+; LE-NEXT:    or 7, 10, 12
+; LE-NEXT:    vupklsw 2, 2
+; LE-NEXT:    sldi 8, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    sldi 9, 4, 2
+; LE-NEXT:    rldicl 4, 4, 62, 2
+; LE-NEXT:    sldi 5, 5, 32
+; LE-NEXT:    sldi 10, 11, 32
+; LE-NEXT:    sldi 11, 6, 2
+; LE-NEXT:    rldicl 6, 6, 62, 2
+; LE-NEXT:    sldi 12, 7, 2
 ; LE-NEXT:    rldicl 7, 7, 62, 2
-; LE-NEXT:    and 6, 8, 6
-; LE-NEXT:    and 5, 7, 5
-; LE-NEXT:    or 5, 5, 6
-; LE-NEXT:    sldi 6, 5, 4
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    oris 5, 5, 61680
+; LE-NEXT:    oris 10, 10, 3855
+; LE-NEXT:    and 11, 11, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    and 12, 12, 30
+; LE-NEXT:    and 7, 7, 0
+; LE-NEXT:    or 3, 3, 8
+; LE-NEXT:    or 4, 4, 9
+; LE-NEXT:    ori 30, 5, 61680
+; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
+; LE-NEXT:    ori 0, 10, 3855
+; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT:    or 5, 6, 11
+; LE-NEXT:    or 6, 7, 12
+; LE-NEXT:    sldi 7, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    sldi 8, 4, 4
+; LE-NEXT:    rldicl 4, 4, 60, 4
+; LE-NEXT:    sldi 9, 5, 4
 ; LE-NEXT:    rldicl 5, 5, 60, 4
-; LE-NEXT:    and 4, 6, 4
-; LE-NEXT:    and 3, 5, 3
+; LE-NEXT:    sldi 10, 6, 4
+; LE-NEXT:    rldicl 6, 6, 60, 4
+; LE-NEXT:    and 7, 7, 30
+; LE-NEXT:    and 3, 3, 0
+; LE-NEXT:    and 8, 8, 30
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 9, 9, 30
+; LE-NEXT:    and 5, 5, 0
+; LE-NEXT:    and 10, 10, 30
+; LE-NEXT:    and 6, 6, 0
+; LE-NEXT:    or 3, 3, 7
+; LE-NEXT:    or 4, 4, 8
+; LE-NEXT:    or 5, 5, 9
+; LE-NEXT:    or 6, 6, 10
+; LE-NEXT:    rldicl 7, 3, 32, 32
+; LE-NEXT:    rotlwi 8, 3, 24
+; LE-NEXT:    rldicl 9, 4, 32, 32
+; LE-NEXT:    rotlwi 10, 4, 24
+; LE-NEXT:    rldicl 11, 5, 32, 32
+; LE-NEXT:    rotlwi 12, 5, 24
+; LE-NEXT:    rotlwi 29, 7, 24
+; LE-NEXT:    rlwimi 8, 3, 8, 8, 15
+; LE-NEXT:    rotlwi 28, 9, 24
+; LE-NEXT:    rlwimi 10, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 8, 3, 8, 24, 31
+; LE-NEXT:    rlwimi 10, 4, 8, 24, 31
+; LE-NEXT:    rotlwi 4, 11, 24
+; LE-NEXT:    rlwimi 12, 5, 8, 8, 15
+; LE-NEXT:    rlwimi 29, 7, 8, 8, 15
+; LE-NEXT:    sldi 3, 8, 32
+; LE-NEXT:    rlwimi 28, 9, 8, 8, 15
+; LE-NEXT:    sldi 8, 10, 32
+; LE-NEXT:    rlwimi 12, 5, 8, 24, 31
+; LE-NEXT:    rlwimi 29, 7, 8, 24, 31
+; LE-NEXT:    rlwimi 28, 9, 8, 24, 31
+; LE-NEXT:    rlwimi 4, 11, 8, 8, 15
+; LE-NEXT:    sldi 5, 12, 32
+; LE-NEXT:    or 9, 3, 29
+; LE-NEXT:    or 3, 8, 28
+; LE-NEXT:    rlwimi 4, 11, 8, 24, 31
+; LE-NEXT:    or 10, 5, 4
+; LE-NEXT:    rlwinm 4, 3, 0, 30, 30
+; LE-NEXT:    std 4, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 5, 5
+; LE-NEXT:    std 4, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 4, 4
+; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 3, 3
+; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 2, 2
+; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 1, 1
+; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    rlwinm 4, 3, 0, 0, 0
+; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 31, 33
+; LE-NEXT:    std 4, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 30, 34
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 29, 35
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 28, 36
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 27, 37
+; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 26, 38
+; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 25, 39
+; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 24, 40
+; LE-NEXT:    rldicl 0, 6, 32, 32
+; LE-NEXT:    rotlwi 30, 6, 24
+; LE-NEXT:    rotlwi 27, 0, 24
+; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 23, 41
+; LE-NEXT:    rlwimi 30, 6, 8, 8, 15
+; LE-NEXT:    rlwimi 30, 6, 8, 24, 31
+; LE-NEXT:    rlwimi 27, 0, 8, 8, 15
+; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 22, 42
+; LE-NEXT:    sldi 6, 30, 32
+; LE-NEXT:    rlwimi 27, 0, 8, 24, 31
+; LE-NEXT:    or 11, 6, 27
+; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 21, 43
+; LE-NEXT:    clrldi 5, 3, 63
+; LE-NEXT:    rlwinm 6, 3, 0, 29, 29
+; LE-NEXT:    rlwinm 7, 3, 0, 28, 28
+; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 20, 44
+; LE-NEXT:    rlwinm 8, 3, 0, 27, 27
+; LE-NEXT:    rlwinm 12, 3, 0, 26, 26
+; LE-NEXT:    rlwinm 0, 3, 0, 25, 25
+; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 19, 45
+; LE-NEXT:    rlwinm 30, 3, 0, 24, 24
+; LE-NEXT:    rlwinm 29, 3, 0, 23, 23
+; LE-NEXT:    rlwinm 28, 3, 0, 22, 22
+; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 18, 46
+; LE-NEXT:    rlwinm 27, 3, 0, 21, 21
+; LE-NEXT:    rlwinm 26, 3, 0, 20, 20
+; LE-NEXT:    rlwinm 25, 3, 0, 19, 19
+; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 17, 47
+; LE-NEXT:    rlwinm 24, 3, 0, 18, 18
+; LE-NEXT:    rlwinm 23, 3, 0, 17, 17
+; LE-NEXT:    rlwinm 22, 3, 0, 16, 16
+; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 16, 48
+; LE-NEXT:    rlwinm 21, 3, 0, 15, 15
+; LE-NEXT:    rlwinm 20, 3, 0, 14, 14
+; LE-NEXT:    rlwinm 19, 3, 0, 13, 13
+; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 15, 49
+; LE-NEXT:    rlwinm 18, 3, 0, 12, 12
+; LE-NEXT:    rlwinm 17, 3, 0, 11, 11
+; LE-NEXT:    rlwinm 16, 3, 0, 10, 10
+; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 14, 50
+; LE-NEXT:    rlwinm 15, 3, 0, 9, 9
+; LE-NEXT:    rlwinm 14, 3, 0, 8, 8
+; LE-NEXT:    rlwinm 31, 3, 0, 7, 7
+; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 13, 51
+; LE-NEXT:    rlwinm 2, 3, 0, 6, 6
+; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 12, 52
+; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 11, 53
+; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 10, 54
+; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 9, 55
+; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 8, 56
+; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 7, 57
+; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 6, 58
+; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 5, 59
+; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 4, 60
+; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 3, 61
+; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 2, 62
+; LE-NEXT:    rldicr 3, 3, 0, 0
+; LE-NEXT:    std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 7
+; LE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 8
+; LE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 12
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 30
+; LE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 29
+; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 28
+; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 27
+; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 26
+; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 25
+; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 24
+; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 23
+; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 22
+; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 21
+; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 20
+; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 19
+; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 18
+; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 17
+; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 16
+; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 15
+; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 14
+; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 31
+; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 2
+; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 384(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 4, 3, 32, 31
+; LE-NEXT:    ld 3, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 5, 3, 33, 30
+; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 6, 3, 34, 29
+; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 7, 3, 35, 28
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 8, 3, 36, 27
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 12, 3, 37, 26
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 0, 3, 38, 25
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 30, 3, 39, 24
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 29, 3, 40, 23
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 28, 3, 41, 22
+; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 27, 3, 42, 21
+; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 26, 3, 43, 20
+; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 25, 3, 44, 19
+; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 24, 3, 45, 18
+; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 23, 3, 46, 17
+; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 22, 3, 47, 16
+; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 21, 3, 48, 15
+; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 20, 3, 49, 14
+; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 19, 3, 50, 13
+; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 18, 3, 51, 12
+; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 17, 3, 52, 11
+; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 16, 3, 53, 10
+; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 15, 3, 54, 9
+; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 14, 3, 55, 8
+; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 31, 3, 56, 7
+; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 2, 3, 57, 6
+; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 58, 5
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 59, 4
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 60, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 61, 2
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    rldicl 3, 3, 62, 1
+; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 4
+; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    mulld 6, 11, 7
+; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    mulld 8, 11, 12
+; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 2, 11, 3
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 11, 11, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 30, 30
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 296(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 5, 4
+; LE-NEXT:    rlwinm 5, 9, 0, 29, 29
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 304(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 312(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 320(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    ld 5, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 4, 4, 5
+; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
+; LE-NEXT:    mulld 5, 10, 5
+; LE-NEXT:    xor 3, 3, 5
+; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 520(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 512(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 504(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 496(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 488(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 480(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 472(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 464(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 456(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 448(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 440(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 432(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 424(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 416(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 408(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 400(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 392(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 376(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 368(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 360(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 352(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 272(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 264(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 6
+; LE-NEXT:    ld 6, 592(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 7
+; LE-NEXT:    ld 7, 584(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 8
+; LE-NEXT:    ld 8, 576(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 12
+; LE-NEXT:    ld 12, 560(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 0
+; LE-NEXT:    ld 0, 552(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 30
+; LE-NEXT:    ld 30, 736(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 29
+; LE-NEXT:    ld 29, 728(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 28
+; LE-NEXT:    ld 28, 720(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 27
+; LE-NEXT:    ld 27, 712(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 26
+; LE-NEXT:    ld 26, 704(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 25
+; LE-NEXT:    ld 25, 696(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 24
+; LE-NEXT:    ld 24, 688(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 23
+; LE-NEXT:    ld 23, 680(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 22
+; LE-NEXT:    ld 22, 672(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 21
+; LE-NEXT:    ld 21, 664(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 20
+; LE-NEXT:    ld 20, 656(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 19
+; LE-NEXT:    ld 19, 648(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 18
+; LE-NEXT:    ld 18, 640(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 17
+; LE-NEXT:    ld 17, 632(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 16
+; LE-NEXT:    ld 16, 624(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 15
+; LE-NEXT:    ld 15, 616(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 14
+; LE-NEXT:    ld 14, 608(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 31
+; LE-NEXT:    ld 31, 744(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 2
+; LE-NEXT:    ld 2, 600(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 11
+; LE-NEXT:    ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    rotlwi 5, 3, 24
+; LE-NEXT:    rldicl 4, 3, 32, 32
+; LE-NEXT:    rlwimi 5, 3, 8, 8, 15
+; LE-NEXT:    rlwimi 5, 3, 8, 24, 31
+; LE-NEXT:    rotlwi 3, 4, 24
+; LE-NEXT:    rlwimi 3, 4, 8, 8, 15
+; LE-NEXT:    rlwimi 3, 4, 8, 24, 31
+; LE-NEXT:    sldi 4, 5, 32
+; LE-NEXT:    or 3, 4, 3
+; LE-NEXT:    ld 4, 328(1) # 8-byte Folded Reload
+; LE-NEXT:    mtfprd 0, 3
+; LE-NEXT:    rlwinm 3, 9, 0, 24, 24
+; LE-NEXT:    mulld 3, 10, 3
+; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    rlwinm 4, 9, 0, 23, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 22, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 21, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 20, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 19, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 18, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 17, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 16, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 15, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 14, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 13, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 12, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 11, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 10, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 9, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 8, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 7, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 6, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 5, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 4, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 3, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 2, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 1, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rlwinm 4, 9, 0, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 32, 32
+; LE-NEXT:    rldicl 4, 4, 32, 31
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 31, 33
+; LE-NEXT:    rldicl 4, 4, 33, 30
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 30, 34
+; LE-NEXT:    rldicl 4, 4, 34, 29
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 29, 35
+; LE-NEXT:    rldicl 4, 4, 35, 28
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 28, 36
+; LE-NEXT:    rldicl 4, 4, 36, 27
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 27, 37
+; LE-NEXT:    rldicl 4, 4, 37, 26
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 26, 38
+; LE-NEXT:    rldicl 4, 4, 38, 25
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 25, 39
+; LE-NEXT:    rldicl 4, 4, 39, 24
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 24, 40
+; LE-NEXT:    rldicl 4, 4, 40, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 23, 41
+; LE-NEXT:    rldicl 4, 4, 41, 22
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 22, 42
+; LE-NEXT:    rldicl 4, 4, 42, 21
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 21, 43
+; LE-NEXT:    rldicl 4, 4, 43, 20
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 20, 44
+; LE-NEXT:    rldicl 4, 4, 44, 19
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 19, 45
+; LE-NEXT:    rldicl 4, 4, 45, 18
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 18, 46
+; LE-NEXT:    rldicl 4, 4, 46, 17
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 17, 47
+; LE-NEXT:    rldicl 4, 4, 47, 16
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 16, 48
+; LE-NEXT:    rldicl 4, 4, 48, 15
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 15, 49
+; LE-NEXT:    rldicl 4, 4, 49, 14
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 14, 50
+; LE-NEXT:    rldicl 4, 4, 50, 13
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 13, 51
+; LE-NEXT:    rldicl 4, 4, 51, 12
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 12, 52
+; LE-NEXT:    rldicl 4, 4, 52, 11
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 11, 53
+; LE-NEXT:    rldicl 4, 4, 53, 10
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 10, 54
+; LE-NEXT:    rldicl 4, 4, 54, 9
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 9, 55
+; LE-NEXT:    rldicl 4, 4, 55, 8
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 8, 56
+; LE-NEXT:    rldicl 4, 4, 56, 7
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 7, 57
+; LE-NEXT:    rldicl 4, 4, 57, 6
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 6, 58
+; LE-NEXT:    rldicl 4, 4, 58, 5
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 5, 59
+; LE-NEXT:    rldicl 4, 4, 59, 4
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 4, 60
+; LE-NEXT:    rldicl 4, 4, 60, 3
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 3, 61
+; LE-NEXT:    rldicl 4, 4, 61, 2
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicl 4, 9, 2, 62
+; LE-NEXT:    rldicl 4, 4, 62, 1
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicr 4, 9, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 1
+; LE-NEXT:    rldicl 3, 3, 63, 1
+; LE-NEXT:    and 4, 4, 7
+; LE-NEXT:    and 3, 3, 6
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 2
+; LE-NEXT:    rldicl 3, 3, 62, 2
+; LE-NEXT:    and 4, 4, 11
+; LE-NEXT:    and 3, 3, 8
+; LE-NEXT:    or 3, 3, 4
+; LE-NEXT:    sldi 4, 3, 4
+; LE-NEXT:    rldicl 3, 3, 60, 4
+; LE-NEXT:    and 4, 4, 0
+; LE-NEXT:    and 3, 3, 12
 ; LE-NEXT:    or 3, 3, 4
 ; LE-NEXT:    rldicl 4, 3, 32, 32
 ; LE-NEXT:    rotlwi 5, 4, 24
@@ -9745,8 +9626,9 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    sldi 3, 4, 32
 ; LE-NEXT:    or 3, 3, 5
 ; LE-NEXT:    mtfprd 1, 3
-; LE-NEXT:    xxmrghd 35, 0, 1
+; LE-NEXT:    xxmrghd 35, 1, 0
 ; LE-NEXT:    vsrd 2, 3, 2
+; LE-NEXT:    addi 1, 1, 752
 ; LE-NEXT:    blr
   %a.ext = zext <2 x i64> %a to <2 x i128>
   %b.ext = zext <2 x i64> %b to <2 x i128>
diff --git a/llvm/test/CodeGen/X86/clmul-vector.ll b/llvm/test/CodeGen/X86/clmul-vector.ll
index 8f26f84c01883..8ca41b57072ed 100644
--- a/llvm/test/CodeGen/X86/clmul-vector.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector.ll
@@ -434,97 +434,78 @@ define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX2-LABEL: clmul_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
-; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT:    vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX2-NEXT:    vmovq %xmm5, %rax
+; AVX2-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT:    vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: clmul_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm2 ^ xmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ xmm3 ^ xmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT:    vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX512-NEXT:    vmovq %xmm5, %rax
+; AVX512-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX512-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT:    vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX512-NEXT:    vmovq %xmm4, %rax
+; AVX512-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %res



More information about the llvm-commits mailing list