[llvm] [SelectionDAG] Use Karatsuba decomposition to expand vector CLMUL via narrower legal types (PR #184468)

via llvm-commits llvm-commits at lists.llvm.org
Wed Mar 4 02:14:59 PST 2026


https://github.com/AbdallahRashed updated https://github.com/llvm/llvm-project/pull/184468

>From f12211ab888801f79070cb2fb842b25480443860 Mon Sep 17 00:00:00 2001
From: AbdallahRashed <abdallah.mrashed at gmail.com>
Date: Sat, 28 Feb 2026 22:57:00 +0100
Subject: [PATCH] [SelectionDAG] Use Karatsuba decomposition to expand vector
 CLMUL via narrower legal types

Reuse the ExpandIntRes_CLMUL Karatsuba identity to expand vector
CLMUL/CLMULR/CLMULH on wider element types (vXi16, vXi32, vXi64) by
decomposing into half-element-width operations that eventually reach a
legal CLMUL type.

Three generic strategies in expandCLMUL:
1. Karatsuba: halve element width (e.g. v8i16 -> v8i8 on AArch64)
2. Element widen: zext to wider type if CLMUL is legal there (e.g. x86)
3. Count widen: pad with undef to double element count (e.g. v4i16 -> v8i16)

A helper canNarrowCLMULToLegal() guides strategy selection and prevents
circular expansion in the CLMULH bitreverse path.

Also add Custom BITREVERSE lowering for v4i16/v8i16 on AArch64 using
REV16+RBIT, which the CLMULH expansion relies on.

Fixes #183768
---
 .../CodeGen/SelectionDAG/TargetLowering.cpp   |  166 +-
 .../Target/AArch64/AArch64ISelLowering.cpp    |   16 +
 llvm/test/CodeGen/AArch64/clmul-fixed.ll      | 2533 ++++++-----------
 llvm/test/CodeGen/PowerPC/clmul-vector.ll     |  433 +--
 llvm/test/CodeGen/X86/clmul-vector.ll         |  155 +-
 5 files changed, 1324 insertions(+), 1979 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc719b1e67f53..ce096cf03293c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8456,6 +8456,53 @@ SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
   return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal);
 }
 
+/// Check if CLMUL on VT can eventually reach a type with legal or custom CLMUL
+/// through a chain of Karatsuba decompositions (halving element width) and/or
+/// vector widening (doubling element count), visiting only legal types. This
+/// guides strategy selection. Always false for scalars and scalable vectors.
+///
+/// KaratsubaDepth counts halvings only (each creates ~4x more operations) and
+/// is capped at 2. TotalDepth counts every recursive step, including cheap
+/// O(1) pad/extract widenings, and is capped at 8 to bound the recursion.
+///   1 halving: ~4 sub-CLMULs (good, e.g. v8i16 -> v8i8)
+///   2 halvings: ~16 sub-CLMULs (acceptable, e.g. v4i32 -> v4i16 -> v8i8)
+///   3 halvings: ~64 sub-CLMULs (worse than bit-by-bit expansion)
+static bool canNarrowCLMULToLegal(const TargetLowering &TLI, LLVMContext &Ctx,
+                                  EVT VT, unsigned KaratsubaDepth = 0,
+                                  unsigned TotalDepth = 0) {
+  if (KaratsubaDepth > 2 || TotalDepth > 8 || !VT.isVector() ||
+      VT.isScalableVector())
+    return false;
+  if (TLI.isOperationLegalOrCustom(ISD::CLMUL, VT))
+    return true;
+  if (!TLI.isTypeLegal(VT))
+    return false;
+
+  unsigned BW = VT.getScalarSizeInBits();
+
+  // Karatsuba: halve element width, same element count.
+  // This is the expensive step: each halving creates ~4x more operations.
+  if (BW >= 16) {
+    EVT HalfEltVT = EVT::getIntegerVT(Ctx, BW / 2);
+    EVT HalfVT = EVT::getVectorVT(Ctx, HalfEltVT, VT.getVectorElementCount());
+    if (TLI.isTypeLegal(HalfVT) &&
+        canNarrowCLMULToLegal(TLI, Ctx, HalfVT, KaratsubaDepth + 1,
+                              TotalDepth + 1))
+      return true;
+  }
+
+  // Widen: double element count (fixed-width vectors only).
+  // This is cheap: just INSERT_SUBVECTOR + EXTRACT_SUBVECTOR.
+  if (auto EC = VT.getVectorElementCount(); EC.isFixed()) {
+    EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), EC * 2);
+    if (TLI.isTypeLegal(WideVT) &&
+        canNarrowCLMULToLegal(TLI, Ctx, WideVT, KaratsubaDepth, TotalDepth + 1))
+      return true;
+  }
+
+  return false;
+}
+
 SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
   SDLoc DL(Node);
   EVT VT = Node->getValueType(0);
@@ -8463,19 +8510,104 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
   SDValue Y = Node->getOperand(1);
   unsigned BW = VT.getScalarSizeInBits();
   unsigned Opcode = Node->getOpcode();
-
-  // Scalarize if the vector multiplication is unlikely to work.
-  if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
-    return DAG.UnrollVectorOp(Node);
+  LLVMContext &Ctx = *DAG.getContext();
 
   switch (Opcode) {
   case ISD::CLMUL: {
+    // For vector types, try decomposition strategies that leverage legal
+    // CLMUL on narrower or wider element types, avoiding the expensive
+    // bit-by-bit expansion.
+    if (VT.isVector()) {
+      // Strategy 1: Karatsuba decomposition to half-element-width CLMUL.
+      // Applies ExpandIntRes_CLMUL's identity element-wise:
+      //   CLMUL(X, Y) = (Hi << HalfBW) | Lo
+      // where:
+      //   Lo = CLMUL(XLo, YLo)
+      //   Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+      unsigned HalfBW = BW / 2;
+      if (HalfBW >= 8) {
+        EVT HalfEltVT = EVT::getIntegerVT(Ctx, HalfBW);
+        EVT HalfVT =
+            EVT::getVectorVT(Ctx, HalfEltVT, VT.getVectorElementCount());
+        if (isTypeLegal(HalfVT) &&
+            canNarrowCLMULToLegal(*this, Ctx, HalfVT,
+                                  /*KaratsubaDepth=*/1)) {
+          SDValue ShAmt = DAG.getShiftAmountConstant(HalfBW, VT, DL);
+
+          // Extract low and high halves of each element.
+          SDValue XLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, X);
+          SDValue XHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                    DAG.getNode(ISD::SRL, DL, VT, X, ShAmt));
+          SDValue YLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Y);
+          SDValue YHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+                                    DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt));
+
+          // Lo = CLMUL(XLo, YLo)
+          SDValue Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YLo);
+
+          // Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+          SDValue LoH = DAG.getNode(ISD::CLMULH, DL, HalfVT, XLo, YLo);
+          SDValue Cross1 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YHi);
+          SDValue Cross2 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XHi, YLo);
+          SDValue Cross = DAG.getNode(ISD::XOR, DL, HalfVT, Cross1, Cross2);
+          SDValue Hi = DAG.getNode(ISD::XOR, DL, HalfVT, LoH, Cross);
+
+          // Reassemble: Result = ZExt(Lo) | (ZExt(Hi) << HalfBW)
+          SDValue LoExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo);
+          SDValue HiExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi);
+          SDValue HiShifted = DAG.getNode(ISD::SHL, DL, VT, HiExt, ShAmt);
+          return DAG.getNode(ISD::OR, DL, VT, LoExt, HiShifted);
+        }
+      }
+
+      // Strategy 2: Widen to double-element-width CLMUL.
+      // CLMUL(X, Y) = Trunc(CLMUL(ZExt(X), ZExt(Y)))
+      {
+        EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+        if (isTypeLegal(ExtVT) && isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
+            isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT)) {
+          // If CLMUL on ExtVT is Custom (not Legal), the target may
+          // scalarize it, costing O(NumElements) scalar ops. The bit-by-bit
+          // fallback costs O(BW) vectorized iterations. Only widen when
+          // element count is small enough that scalarization is cheaper.
+          unsigned NumElts = VT.getVectorMinNumElements();
+          if (isOperationLegal(ISD::CLMUL, ExtVT) || NumElts < BW) {
+            SDValue XExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, X);
+            SDValue YExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Y);
+            SDValue Mul = DAG.getNode(ISD::CLMUL, DL, ExtVT, XExt, YExt);
+            return DAG.getNode(ISD::TRUNCATE, DL, VT, Mul);
+          }
+        }
+      }
+
+      // Strategy 3: Widen element count (pad with undef, do CLMUL on wider
+      // vector, extract lower result). CLMUL is element-wise, so upper
+      // (undef) lanes don't affect the lower results.
+      // e.g. v4i16 → pad to v8i16 → Karatsuba to v8i8 PMUL → extract v4i16.
+      if (auto EC = VT.getVectorElementCount(); EC.isFixed()) {
+        EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), EC * 2);
+        if (isTypeLegal(WideVT) && canNarrowCLMULToLegal(*this, Ctx, WideVT)) {
+          SDValue Undef = DAG.getUNDEF(WideVT);
+          SDValue XWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+                                      X, DAG.getVectorIdxConstant(0, DL));
+          SDValue YWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+                                      Y, DAG.getVectorIdxConstant(0, DL));
+          SDValue WideRes = DAG.getNode(ISD::CLMUL, DL, WideVT, XWide, YWide);
+          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
+                             DAG.getVectorIdxConstant(0, DL));
+        }
+      }
+    }
+
+    // Scalarize if the vector multiplication is unlikely to work.
+    if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+      return DAG.UnrollVectorOp(Node);
+
     // NOTE: If you change this expansion, please update the cost model
     // calculation in BasicTTIImpl::getTypeBasedIntrinsicInstrCost for
     // Intrinsic::clmul.
 
-    EVT SetCCVT =
-        getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+    EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), Ctx, VT);
 
     SDValue Res = DAG.getConstant(0, DL, VT);
     for (unsigned I = 0; I < BW; ++I) {
@@ -8488,8 +8620,7 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
       // instructions.
       SDValue Part;
       if (!hasBitTest(Y, ShiftAmt) &&
-          isOperationLegalOrCustom(
-              ISD::MUL, getTypeToTransformTo(*DAG.getContext(), VT))) {
+          isOperationLegalOrCustom(ISD::MUL, getTypeToTransformTo(Ctx, VT))) {
         Part = DAG.getNode(ISD::MUL, DL, VT, X, YMasked);
       } else {
         // Canonical bit test: (Y & (1 << I)) != 0
@@ -8516,17 +8647,20 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
     }
     [[fallthrough]];
   case ISD::CLMULH: {
-    EVT ExtVT = VT.changeElementType(
-        *DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), 2 * BW));
-    // For example, ExtVT = i64 based operations aren't legal on a 32-bit
-    // target; use bitreverse-based lowering in this case.
-    // Also prefer bitreverse-based lowering when CLMUL is legal on VT but
-    // not on ExtVT, to avoid expanding CLMUL on the wider type (e.g. v8i8
-    // on AArch64 where CLMUL v8i8 is legal via PMUL but CLMUL v8i16 is not).
+    EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+    // Use bitreverse-based lowering (CLMULR/H = rev(CLMUL(rev,rev)) >> S)
+    // when any of these hold:
+    // (a) ZERO_EXTEND to ExtVT or SRL on ExtVT isn't legal or custom.
+    // (b) CLMUL is legal on VT but not on ExtVT (e.g. v8i8 on AArch64).
+    // (c) CLMUL on VT can be efficiently expanded via Karatsuba/widening
+    //     to reach legal CLMUL. The bitreverse path creates CLMUL(VT) which
+    //     will be expanded efficiently. The widening path would create
+    //     CLMUL(ExtVT) → Karatsuba → CLMULH(VT), causing a cycle.
     if (!isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT) ||
         !isOperationLegalOrCustom(ISD::SRL, ExtVT) ||
         (!isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
-         isOperationLegalOrCustom(ISD::CLMUL, VT))) {
+         isOperationLegalOrCustom(ISD::CLMUL, VT)) ||
+        canNarrowCLMULToLegal(*this, Ctx, VT)) {
       SDValue XRev = DAG.getNode(ISD::BITREVERSE, DL, VT, X);
       SDValue YRev = DAG.getNode(ISD::BITREVERSE, DL, VT, Y);
       SDValue ClMul = DAG.getNode(ISD::CLMUL, DL, VT, XRev, YRev);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2cd78493d2c23..b7d186d83c92e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1329,6 +1329,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::CTLS, VT, Legal);
     setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
     setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
+    setOperationAction(ISD::BITREVERSE, MVT::v4i16, Custom);
+    setOperationAction(ISD::BITREVERSE, MVT::v8i16, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
     setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
@@ -11960,6 +11962,20 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
   default:
     llvm_unreachable("Invalid type for bitreverse!");
 
+  case MVT::v4i16: {
+    VST = MVT::v8i8;
+    REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
+  case MVT::v8i16: {
+    VST = MVT::v16i8;
+    REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+    break;
+  }
+
   case MVT::v2i32: {
     VST = MVT::v8i8;
     REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 23692dc456fc2..46ad7d9bbc295 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -23,69 +23,23 @@ define <8 x i8> @clmul_v8i8_neon(<8 x i8> %x, <8 x i8> %y) {
 define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v3.8h, #1
-; CHECK-NEXT:    movi v4.8h, #4
-; CHECK-NEXT:    movi v5.8h, #8
-; CHECK-NEXT:    movi v6.8h, #16
-; CHECK-NEXT:    movi v7.8h, #32
-; CHECK-NEXT:    movi v16.8h, #128
-; CHECK-NEXT:    movi v17.8h, #1, lsl #8
-; CHECK-NEXT:    movi v18.8h, #8, lsl #8
-; CHECK-NEXT:    movi v19.8h, #16, lsl #8
-; CHECK-NEXT:    movi v20.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    mul v3.8h, v0.8h, v3.8h
-; CHECK-NEXT:    mul v4.8h, v0.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v0.8h, v5.8h
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    mul v6.8h, v0.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v0.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v0.8h, v16.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v17.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    mul v18.8h, v0.8h, v18.8h
-; CHECK-NEXT:    mul v19.8h, v0.8h, v19.8h
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v4.8h, v0.8h, v20.8h
-; CHECK-NEXT:    movi v20.8h, #128, lsl #8
-; CHECK-NEXT:    mul v5.8h, v0.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v0.8h, v22.8h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v20.16b
-; CHECK-NEXT:    mul v3.8h, v0.8h, v21.8h
-; CHECK-NEXT:    mul v17.8h, v0.8h, v23.8h
-; CHECK-NEXT:    eor v4.16b, v6.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v7.16b, v5.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    ret
   %a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %x, <8 x i16> %y)
   ret <8 x i16> %a
@@ -94,69 +48,26 @@ define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
 define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: clmul_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4h, #2
-; CHECK-NEXT:    movi v3.4h, #1
-; CHECK-NEXT:    movi v4.4h, #4
-; CHECK-NEXT:    movi v5.4h, #8
-; CHECK-NEXT:    movi v6.4h, #16
-; CHECK-NEXT:    movi v7.4h, #32
-; CHECK-NEXT:    movi v16.4h, #128
-; CHECK-NEXT:    movi v17.4h, #1, lsl #8
-; CHECK-NEXT:    movi v18.4h, #8, lsl #8
-; CHECK-NEXT:    movi v19.4h, #16, lsl #8
-; CHECK-NEXT:    movi v20.4h, #64
-; CHECK-NEXT:    movi v21.4h, #2, lsl #8
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    movi v22.4h, #32, lsl #8
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v2.4h, v0.4h, v2.4h
-; CHECK-NEXT:    mul v3.4h, v0.4h, v3.4h
-; CHECK-NEXT:    mul v4.4h, v0.4h, v4.4h
-; CHECK-NEXT:    mul v5.4h, v0.4h, v5.4h
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v23.4h, #4, lsl #8
-; CHECK-NEXT:    movi v24.4h, #64, lsl #8
-; CHECK-NEXT:    mul v6.4h, v0.4h, v6.4h
-; CHECK-NEXT:    mul v7.4h, v0.4h, v7.4h
-; CHECK-NEXT:    mul v16.4h, v0.4h, v16.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v17.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    mul v18.4h, v0.4h, v18.4h
-; CHECK-NEXT:    mul v19.4h, v0.4h, v19.4h
-; CHECK-NEXT:    and v22.8b, v1.8b, v22.8b
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v4.4h, v0.4h, v20.4h
-; CHECK-NEXT:    movi v20.4h, #128, lsl #8
-; CHECK-NEXT:    mul v5.4h, v0.4h, v21.4h
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v24.8b
-; CHECK-NEXT:    mul v22.4h, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    eor v7.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v16.8b, v18.8b, v19.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v20.8b
-; CHECK-NEXT:    mul v3.4h, v0.4h, v21.4h
-; CHECK-NEXT:    mul v17.4h, v0.4h, v23.4h
-; CHECK-NEXT:    eor v4.8b, v6.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v7.8b, v5.8b
-; CHECK-NEXT:    eor v6.8b, v16.8b, v22.8b
-; CHECK-NEXT:    mul v0.4h, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.8b, v2.8b, v4.8b
-; CHECK-NEXT:    eor v2.8b, v5.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
 ; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
 ; CHECK-NEXT:    ret
   %a = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %x, <4 x i16> %y)
   ret <4 x i16> %a
@@ -165,269 +76,184 @@ define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
 define <4 x i32> @clmul_v4i32_neon(<4 x i32> %x, <4 x i32> %y) {
 ; CHECK-LABEL: clmul_v4i32_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #64
-; CHECK-NEXT:    movi v17.4s, #128
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    movi v21.4s, #128, lsl #16
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    mul v2.4s, v0.4s, v2.4s
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v0.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #16, lsl #8
-; CHECK-NEXT:    mul v5.4s, v0.4s, v16.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v17.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v18.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #32, lsl #8
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    movi v20.4s, #64, lsl #8
-; CHECK-NEXT:    mul v21.4s, v0.4s, v21.4s
-; CHECK-NEXT:    and v3.16b, v1.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v5.16b
-; CHECK-NEXT:    movi v4.4s, #1, lsl #16
-; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT:    movi v16.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    movi v17.4s, #4, lsl #8
-; CHECK-NEXT:    mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v16.16b
-; CHECK-NEXT:    movi v16.4s, #64, lsl #16
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v19.4s
-; CHECK-NEXT:    movi v19.4s, #4, lsl #16
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v18.16b, v3.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v20.4s, #1, lsl #24
-; CHECK-NEXT:    mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #8
-; CHECK-NEXT:    mul v16.4s, v0.4s, v16.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v20.4s, v0.4s, v20.4s
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v18.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v19.16b
-; CHECK-NEXT:    mul v18.4s, v0.4s, v23.4s
-; CHECK-NEXT:    and v19.16b, v1.16b, v25.16b
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v24.16b
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT:    eor v16.16b, v16.16b, v20.16b
-; CHECK-NEXT:    and v20.16b, v1.16b, v26.16b
-; CHECK-NEXT:    mul v19.4s, v0.4s, v19.4s
-; CHECK-NEXT:    mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v0.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v0.4s, v20.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v22.16b
-; CHECK-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v19.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v0.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v0.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v0.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v6.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    xtn v2.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v0.4s
+; CHECK-NEXT:    shrn v16.4h, v0.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v2.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v1.8b, v3.8h
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v0.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v1.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEXT:    pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEXT:    pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %x, <4 x i32> %y)
   ret <4 x i32> %a
 }
 
 define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: clmul_v2i32_neon:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.2s, #2
-; CHECK-NEXT:    movi v3.2s, #1
-; CHECK-NEXT:    movi v4.2s, #4
-; CHECK-NEXT:    movi v5.2s, #8
-; CHECK-NEXT:    movi v6.2s, #16
-; CHECK-NEXT:    movi v7.2s, #32
-; CHECK-NEXT:    movi v16.2s, #64
-; CHECK-NEXT:    movi v17.2s, #128
-; CHECK-NEXT:    movi v18.2s, #1, lsl #8
-; CHECK-NEXT:    movi v19.2s, #2, lsl #8
-; CHECK-NEXT:    movi v20.2s, #8, lsl #8
-; CHECK-NEXT:    movi v21.2s, #128, lsl #16
-; CHECK-NEXT:    and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT:    and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT:    and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT:    mul v2.2s, v0.2s, v2.2s
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    mul v6.2s, v0.2s, v6.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT:    movi v22.2s, #8, lsl #16
-; CHECK-NEXT:    movi v23.2s, #2, lsl #24
-; CHECK-NEXT:    movi v25.2s, #4, lsl #24
-; CHECK-NEXT:    movi v24.2s, #32, lsl #16
-; CHECK-NEXT:    movi v26.2s, #8, lsl #24
-; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #16, lsl #8
-; CHECK-NEXT:    mul v5.2s, v0.2s, v16.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v17.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v18.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v19.2s, #32, lsl #8
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    movi v20.2s, #64, lsl #8
-; CHECK-NEXT:    mul v21.2s, v0.2s, v21.2s
-; CHECK-NEXT:    and v3.8b, v1.8b, v4.8b
-; CHECK-NEXT:    eor v5.8b, v6.8b, v5.8b
-; CHECK-NEXT:    movi v4.2s, #1, lsl #16
-; CHECK-NEXT:    eor v6.8b, v16.8b, v17.8b
-; CHECK-NEXT:    movi v16.2s, #2, lsl #16
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT:    movi v17.2s, #4, lsl #8
-; CHECK-NEXT:    mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v5.8b
-; CHECK-NEXT:    and v23.8b, v1.8b, v23.8b
-; CHECK-NEXT:    and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT:    and v5.8b, v1.8b, v16.8b
-; CHECK-NEXT:    movi v16.2s, #64, lsl #16
-; CHECK-NEXT:    eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v19.2s
-; CHECK-NEXT:    movi v19.2s, #4, lsl #16
-; CHECK-NEXT:    and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v18.8b, v3.8b
-; CHECK-NEXT:    and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v20.2s, #1, lsl #24
-; CHECK-NEXT:    mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT:    mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT:    and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v3.8b, v3.8b, v7.8b
-; CHECK-NEXT:    and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT:    movi v19.2s, #128, lsl #8
-; CHECK-NEXT:    mul v16.2s, v0.2s, v16.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #16
-; CHECK-NEXT:    mul v20.2s, v0.2s, v20.2s
-; CHECK-NEXT:    eor v6.8b, v6.8b, v17.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v18.8b
-; CHECK-NEXT:    and v17.8b, v1.8b, v19.8b
-; CHECK-NEXT:    mul v18.2s, v0.2s, v23.2s
-; CHECK-NEXT:    and v19.8b, v1.8b, v25.8b
-; CHECK-NEXT:    eor v16.8b, v16.8b, v21.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v24.8b
-; CHECK-NEXT:    movi v23.2s, #32, lsl #24
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT:    and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT:    movi v22.2s, #16, lsl #24
-; CHECK-NEXT:    movi v24.2s, #64, lsl #24
-; CHECK-NEXT:    mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT:    eor v16.8b, v16.8b, v20.8b
-; CHECK-NEXT:    and v20.8b, v1.8b, v26.8b
-; CHECK-NEXT:    mul v19.2s, v0.2s, v19.2s
-; CHECK-NEXT:    mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v6.8b
-; CHECK-NEXT:    mul v6.2s, v0.2s, v21.2s
-; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT:    and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT:    eor v5.8b, v16.8b, v18.8b
-; CHECK-NEXT:    movi v16.2s, #128, lsl #24
-; CHECK-NEXT:    mul v18.2s, v0.2s, v20.2s
-; CHECK-NEXT:    and v20.8b, v1.8b, v22.8b
-; CHECK-NEXT:    and v22.8b, v1.8b, v24.8b
-; CHECK-NEXT:    eor v3.8b, v3.8b, v17.8b
-; CHECK-NEXT:    eor v4.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v5.8b, v5.8b, v19.8b
-; CHECK-NEXT:    and v1.8b, v1.8b, v16.8b
-; CHECK-NEXT:    mul v7.2s, v0.2s, v20.2s
-; CHECK-NEXT:    mul v16.2s, v0.2s, v21.2s
-; CHECK-NEXT:    mul v17.2s, v0.2s, v22.2s
-; CHECK-NEXT:    eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v3.8b, v4.8b, v6.8b
-; CHECK-NEXT:    eor v4.8b, v5.8b, v18.8b
-; CHECK-NEXT:    mul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT:    eor v1.8b, v2.8b, v3.8b
-; CHECK-NEXT:    eor v2.8b, v4.8b, v7.8b
-; CHECK-NEXT:    eor v3.8b, v16.8b, v17.8b
-; CHECK-NEXT:    eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
-; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT:    ret
+; CHECK-NEON-LABEL: clmul_v2i32_neon:
+; CHECK-NEON:       // %bb.0:
+; CHECK-NEON-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEON-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEON-NEXT:    xtn v2.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v0.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v0.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v2.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v0.8b, v2.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v1.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEON-NEXT:    ret
+;
+; CHECK-AES-LABEL: clmul_v2i32_neon:
+; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
+; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
+; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    ret
   %a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %x, <2 x i32> %y)
   ret <2 x i32> %a
 }
@@ -1730,45 +1556,15 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
 define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 ; CHECK-LABEL: clmul_v8i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    mov v3.16b, v1.16b
-; CHECK-NEXT:    mov v4.16b, v1.16b
-; CHECK-NEXT:    mov v5.16b, v1.16b
-; CHECK-NEXT:    mov v6.16b, v1.16b
-; CHECK-NEXT:    mov v7.16b, v1.16b
-; CHECK-NEXT:    mov v16.16b, v1.16b
-; CHECK-NEXT:    bic v1.8h, #127
-; CHECK-NEXT:    bic v2.8h, #253
-; CHECK-NEXT:    bic v3.8h, #254
-; CHECK-NEXT:    bic v4.8h, #251
-; CHECK-NEXT:    bic v5.8h, #247
-; CHECK-NEXT:    bic v6.8h, #239
-; CHECK-NEXT:    bic v7.8h, #223
-; CHECK-NEXT:    bic v16.8h, #191
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    xtn v2.8b, v2.8h
-; CHECK-NEXT:    xtn v3.8b, v3.8h
-; CHECK-NEXT:    xtn v4.8b, v4.8h
-; CHECK-NEXT:    xtn v5.8b, v5.8h
-; CHECK-NEXT:    xtn v6.8b, v6.8h
-; CHECK-NEXT:    xtn v7.8b, v7.8h
-; CHECK-NEXT:    xtn v16.8b, v16.8h
-; CHECK-NEXT:    umull v2.8h, v0.8b, v2.8b
-; CHECK-NEXT:    umull v3.8h, v0.8b, v3.8b
-; CHECK-NEXT:    umull v4.8h, v0.8b, v4.8b
-; CHECK-NEXT:    umull v5.8h, v0.8b, v5.8b
-; CHECK-NEXT:    umull v6.8h, v0.8b, v6.8b
-; CHECK-NEXT:    umull v7.8h, v0.8b, v7.8b
-; CHECK-NEXT:    umull v16.8h, v0.8b, v16.8b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v16.16b
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rbit v2.8b, v1.8b
+; CHECK-NEXT:    rbit v3.8b, v0.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    rbit v2.8b, v2.8b
+; CHECK-NEXT:    ushr v1.8b, v2.8b, #1
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <8 x i8> %x to <8 x i16>
   %zexty = zext <8 x i8> %y to <8 x i16>
@@ -1779,84 +1575,26 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
 define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 ; CHECK-LABEL: clmul_v16i16_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    rbit v4.8b, v1.8b
+; CHECK-NEXT:    rbit v5.8b, v0.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT:    rbit v6.8b, v3.8b
+; CHECK-NEXT:    rbit v7.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v3.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    pmul v5.8b, v7.8b, v6.8b
 ; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
-; CHECK-NEXT:    mov v4.16b, v2.16b
-; CHECK-NEXT:    mov v5.16b, v2.16b
-; CHECK-NEXT:    mov v6.16b, v2.16b
-; CHECK-NEXT:    mov v7.16b, v2.16b
-; CHECK-NEXT:    mov v16.16b, v2.16b
-; CHECK-NEXT:    mov v17.16b, v2.16b
-; CHECK-NEXT:    mov v18.16b, v1.16b
-; CHECK-NEXT:    mov v19.16b, v1.16b
-; CHECK-NEXT:    mov v20.16b, v1.16b
-; CHECK-NEXT:    mov v21.16b, v1.16b
-; CHECK-NEXT:    mov v22.16b, v1.16b
-; CHECK-NEXT:    mov v23.16b, v1.16b
-; CHECK-NEXT:    bic v4.8h, #253
-; CHECK-NEXT:    bic v5.8h, #254
-; CHECK-NEXT:    bic v6.8h, #251
-; CHECK-NEXT:    bic v7.8h, #247
-; CHECK-NEXT:    mov v3.16b, v2.16b
-; CHECK-NEXT:    bic v16.8h, #239
-; CHECK-NEXT:    bic v17.8h, #223
-; CHECK-NEXT:    bic v18.8h, #253
-; CHECK-NEXT:    bic v19.8h, #254
-; CHECK-NEXT:    bic v20.8h, #251
-; CHECK-NEXT:    bic v21.8h, #247
-; CHECK-NEXT:    bic v22.8h, #239
-; CHECK-NEXT:    bic v23.8h, #223
-; CHECK-NEXT:    mov v24.16b, v1.16b
-; CHECK-NEXT:    uzp1 v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    uzp1 v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    uzp1 v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    bic v3.8h, #191
-; CHECK-NEXT:    uzp1 v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    uzp1 v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    xtn v18.8b, v18.8h
-; CHECK-NEXT:    xtn v19.8b, v19.8h
-; CHECK-NEXT:    xtn v20.8b, v20.8h
-; CHECK-NEXT:    xtn v21.8b, v21.8h
-; CHECK-NEXT:    xtn v22.8b, v22.8h
-; CHECK-NEXT:    xtn v23.8b, v23.8h
-; CHECK-NEXT:    bic v24.8h, #191
-; CHECK-NEXT:    umull2 v4.8h, v0.16b, v4.16b
-; CHECK-NEXT:    umull2 v5.8h, v0.16b, v5.16b
-; CHECK-NEXT:    umull2 v6.8h, v0.16b, v6.16b
-; CHECK-NEXT:    umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT:    uzp1 v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    umull2 v16.8h, v0.16b, v16.16b
-; CHECK-NEXT:    umull2 v17.8h, v0.16b, v17.16b
-; CHECK-NEXT:    umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT:    xtn v24.8b, v24.8h
-; CHECK-NEXT:    umull v19.8h, v0.8b, v19.8b
-; CHECK-NEXT:    umull v20.8h, v0.8b, v20.8b
-; CHECK-NEXT:    umull v21.8h, v0.8b, v21.8b
-; CHECK-NEXT:    umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT:    umull v23.8h, v0.8b, v23.8b
-; CHECK-NEXT:    bic v2.8h, #127
-; CHECK-NEXT:    bic v1.8h, #127
-; CHECK-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT:    umull2 v3.8h, v0.16b, v3.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT:    umull v7.8h, v0.8b, v24.8b
-; CHECK-NEXT:    eor v16.16b, v19.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v20.16b, v21.16b
-; CHECK-NEXT:    eor v18.16b, v22.16b, v23.16b
-; CHECK-NEXT:    uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    xtn v1.8b, v1.8h
-; CHECK-NEXT:    eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT:    eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v6.16b, v18.16b, v7.16b
-; CHECK-NEXT:    umull2 v2.8h, v0.16b, v2.16b
-; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT:    eor v1.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    ushr v2.8b, v4.8b, #1
+; CHECK-NEXT:    ushr v3.8b, v5.8b, #1
+; CHECK-NEXT:    shll v2.8h, v2.8b, #8
+; CHECK-NEXT:    shll v3.8h, v3.8b, #8
+; CHECK-NEXT:    orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT:    orr v1.16b, v1.16b, v3.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <16 x i8> %x to <16 x i16>
   %zexty = zext <16 x i8> %y to <16 x i16>
@@ -1867,86 +1605,74 @@ define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
 define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
 ; CHECK-LABEL: clmul_v4i32_neon_zext:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    rev16 v3.8b, v1.8b
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    xtn v17.8b, v1.8h
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v3.8b, v3.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    xtn v18.8b, v2.8h
+; CHECK-NEXT:    rbit v19.8b, v17.8b
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    xtn v5.8b, v3.8h
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    rbit v20.8b, v18.8b
+; CHECK-NEXT:    rbit v7.8b, v5.8b
+; CHECK-NEXT:    rbit v16.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    pmul v3.8b, v6.8b, v3.8b
+; CHECK-NEXT:    pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT:    pmul v6.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v7.8b, v16.8b, v7.8b
+; CHECK-NEXT:    xtn v16.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    eor v3.8b, v3.8b, v4.8b
+; CHECK-NEXT:    pmul v4.8b, v20.8b, v19.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    rbit v7.8b, v7.8b
+; CHECK-NEXT:    rbit v21.8b, v16.8b
+; CHECK-NEXT:    pmul v2.8b, v16.8b, v2.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    pmul v20.8b, v21.8b, v20.8b
+; CHECK-NEXT:    pmul v19.8b, v21.8b, v19.8b
+; CHECK-NEXT:    ushr v4.8b, v4.8b, #1
+; CHECK-NEXT:    eor v3.8b, v7.8b, v3.8b
+; CHECK-NEXT:    pmul v7.8b, v18.8b, v1.8b
+; CHECK-NEXT:    pmul v18.8b, v0.8b, v18.8b
+; CHECK-NEXT:    rbit v20.8b, v20.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v17.8b
+; CHECK-NEXT:    pmul v1.8b, v16.8b, v1.8b
+; CHECK-NEXT:    shll v3.8h, v3.8b, #8
+; CHECK-NEXT:    eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    eor v2.8b, v2.8b, v18.8b
+; CHECK-NEXT:    ushr v7.8b, v20.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    rbit v5.8b, v19.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v6.8b
+; CHECK-NEXT:    eor v2.8b, v7.8b, v2.8b
+; CHECK-NEXT:    rev16 v3.8b, v3.8b
+; CHECK-NEXT:    ushr v1.8b, v5.8b, #1
+; CHECK-NEXT:    pmul v5.8b, v16.8b, v17.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    shll v2.8h, v2.8b, #8
+; CHECK-NEXT:    rbit v3.8b, v3.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    eor v1.8b, v2.8b, v4.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v2.4h, v3.4h, #1
+; CHECK-NEXT:    ushll v3.8h, v5.8b, #0
+; CHECK-NEXT:    eor v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    shll v1.4s, v1.4h, #16
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %zextx = zext <4 x i16> %x to <4 x i32>
   %zexty = zext <4 x i16> %y to <4 x i32>
@@ -1966,152 +1692,138 @@ define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
 ; CHECK-NEXT:    .cfi_offset b10, -24
 ; CHECK-NEXT:    .cfi_offset b11, -32
 ; CHECK-NEXT:    .cfi_offset b12, -48
-; CHECK-NEXT:    movi v19.4s, #2
-; CHECK-NEXT:    movi v21.4s, #1
-; CHECK-NEXT:    ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT:    movi v17.4s, #4
-; CHECK-NEXT:    movi v20.4s, #8
-; CHECK-NEXT:    movi v5.4s, #16
-; CHECK-NEXT:    movi v4.4s, #32
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    and v3.16b, v2.16b, v19.16b
-; CHECK-NEXT:    and v6.16b, v2.16b, v21.16b
-; CHECK-NEXT:    and v7.16b, v2.16b, v17.16b
-; CHECK-NEXT:    and v16.16b, v2.16b, v20.16b
-; CHECK-NEXT:    and v18.16b, v2.16b, v5.16b
-; CHECK-NEXT:    and v22.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    uzp1 v23.8h, v0.8h, v3.8h
-; CHECK-NEXT:    movi v3.4s, #64
-; CHECK-NEXT:    uzp1 v24.8h, v0.8h, v6.8h
-; CHECK-NEXT:    movi v6.4s, #128
-; CHECK-NEXT:    uzp1 v25.8h, v0.8h, v7.8h
-; CHECK-NEXT:    movi v7.4s, #1, lsl #8
-; CHECK-NEXT:    uzp1 v26.8h, v0.8h, v16.8h
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v18.8h
-; CHECK-NEXT:    uzp1 v28.8h, v0.8h, v22.8h
-; CHECK-NEXT:    movi v16.4s, #8, lsl #8
-; CHECK-NEXT:    movi v18.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #2, lsl #8
-; CHECK-NEXT:    umull2 v29.4s, v0.8h, v23.8h
-; CHECK-NEXT:    and v23.16b, v2.16b, v3.16b
-; CHECK-NEXT:    umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT:    and v30.16b, v2.16b, v6.16b
-; CHECK-NEXT:    and v31.16b, v2.16b, v7.16b
-; CHECK-NEXT:    umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT:    umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT:    umull2 v28.4s, v0.8h, v28.8h
-; CHECK-NEXT:    uzp1 v10.8h, v0.8h, v23.8h
-; CHECK-NEXT:    movi v23.4s, #32, lsl #8
-; CHECK-NEXT:    and v8.16b, v2.16b, v16.16b
-; CHECK-NEXT:    and v9.16b, v2.16b, v18.16b
-; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v31.8h, v0.8h, v31.8h
-; CHECK-NEXT:    and v11.16b, v2.16b, v22.16b
-; CHECK-NEXT:    eor v24.16b, v24.16b, v29.16b
-; CHECK-NEXT:    xtn v12.4h, v19.4s
-; CHECK-NEXT:    uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT:    eor v25.16b, v25.16b, v26.16b
-; CHECK-NEXT:    eor v26.16b, v27.16b, v28.16b
-; CHECK-NEXT:    uzp1 v9.8h, v0.8h, v9.8h
-; CHECK-NEXT:    and v29.16b, v2.16b, v23.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v10.8h
-; CHECK-NEXT:    umull2 v28.4s, v0.8h, v30.8h
-; CHECK-NEXT:    uzp1 v30.8h, v0.8h, v11.8h
-; CHECK-NEXT:    umull2 v31.4s, v0.8h, v31.8h
-; CHECK-NEXT:    and v11.16b, v1.16b, v17.16b
-; CHECK-NEXT:    eor v17.16b, v24.16b, v25.16b
-; CHECK-NEXT:    and v10.16b, v1.16b, v21.16b
-; CHECK-NEXT:    uzp1 v29.8h, v0.8h, v29.8h
-; CHECK-NEXT:    umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT:    movi v21.4s, #4, lsl #8
-; CHECK-NEXT:    umull2 v9.4s, v0.8h, v9.8h
-; CHECK-NEXT:    eor v19.16b, v26.16b, v27.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    umull2 v24.4s, v0.8h, v30.8h
-; CHECK-NEXT:    eor v25.16b, v28.16b, v31.16b
-; CHECK-NEXT:    xtn v28.4h, v11.4s
-; CHECK-NEXT:    xtn v30.4h, v20.4s
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT:    xtn v10.4h, v10.4s
-; CHECK-NEXT:    and v29.16b, v2.16b, v21.16b
-; CHECK-NEXT:    eor v26.16b, v8.16b, v9.16b
-; CHECK-NEXT:    and v9.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v20.16b, v25.16b, v24.16b
-; CHECK-NEXT:    and v25.16b, v1.16b, v5.16b
-; CHECK-NEXT:    umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT:    umull v30.4s, v0.4h, v30.4h
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    eor v4.16b, v26.16b, v27.16b
-; CHECK-NEXT:    and v26.16b, v1.16b, v6.16b
-; CHECK-NEXT:    xtn v27.4h, v9.4s
-; CHECK-NEXT:    xtn v25.4h, v25.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    uzp1 v5.8h, v0.8h, v29.8h
-; CHECK-NEXT:    xtn v26.4h, v26.4s
-; CHECK-NEXT:    eor v28.16b, v28.16b, v30.16b
-; CHECK-NEXT:    movi v30.4s, #128, lsl #8
-; CHECK-NEXT:    umull v27.4s, v0.4h, v27.4h
-; CHECK-NEXT:    and v29.16b, v2.16b, v24.16b
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    umull v25.4s, v0.4h, v25.4h
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v23.4h, v23.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v31.4s, v0.4h, v12.4h
-; CHECK-NEXT:    umull v8.4s, v0.4h, v10.4h
-; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    umull v26.4s, v0.4h, v26.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    eor v25.16b, v25.16b, v27.16b
-; CHECK-NEXT:    uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT:    and v2.16b, v2.16b, v30.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v30.16b
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    xtn v24.4h, v24.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    umull v23.4s, v0.4h, v23.4h
-; CHECK-NEXT:    eor v6.16b, v8.16b, v31.16b
-; CHECK-NEXT:    eor v7.16b, v26.16b, v7.16b
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    rev16 v5.8b, v1.8b
+; CHECK-NEXT:    rev16 v6.8b, v0.8b
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    rev16 v7.8b, v3.8b
+; CHECK-NEXT:    rev16 v17.8b, v2.8b
+; CHECK-NEXT:    rbit v18.8b, v5.8b
+; CHECK-NEXT:    rbit v19.8b, v6.8b
+; CHECK-NEXT:    xtn v5.8b, v1.8h
+; CHECK-NEXT:    xtn v16.8b, v4.8h
+; CHECK-NEXT:    shrn v29.8b, v4.8h, #8
+; CHECK-NEXT:    xtn v6.8b, v0.8h
+; CHECK-NEXT:    shrn v4.8b, v0.8h, #8
+; CHECK-NEXT:    xtn v0.8b, v3.8h
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    rbit v20.8b, v7.8b
+; CHECK-NEXT:    rbit v17.8b, v17.8b
+; CHECK-NEXT:    xtn v21.8b, v18.8h
+; CHECK-NEXT:    xtn v22.8b, v19.8h
+; CHECK-NEXT:    shrn v7.8b, v1.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v19.8h, #8
+; CHECK-NEXT:    shrn v18.8b, v18.8h, #8
+; CHECK-NEXT:    pmul v8.8b, v29.8b, v5.8b
+; CHECK-NEXT:    rbit v23.8b, v5.8b
+; CHECK-NEXT:    rbit v24.8b, v16.8b
+; CHECK-NEXT:    pmul v12.8b, v4.8b, v16.8b
+; CHECK-NEXT:    pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    xtn v25.8b, v20.8h
+; CHECK-NEXT:    xtn v26.8b, v17.8h
+; CHECK-NEXT:    rbit v27.8b, v21.8b
+; CHECK-NEXT:    rbit v28.8b, v22.8b
+; CHECK-NEXT:    pmul v10.8b, v1.8b, v21.8b
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    pmul v18.8b, v22.8b, v18.8b
+; CHECK-NEXT:    shrn v20.8b, v20.8h, #8
+; CHECK-NEXT:    pmul v9.8b, v16.8b, v7.8b
+; CHECK-NEXT:    xtn v1.8b, v2.8h
+; CHECK-NEXT:    pmul v21.8b, v22.8b, v21.8b
+; CHECK-NEXT:    pmul v19.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v30.8b, v25.8b
+; CHECK-NEXT:    rbit v31.8b, v26.8b
+; CHECK-NEXT:    pmul v17.8b, v17.8b, v25.8b
+; CHECK-NEXT:    pmul v27.8b, v28.8b, v27.8b
+; CHECK-NEXT:    pmul v20.8b, v26.8b, v20.8b
+; CHECK-NEXT:    rbit v28.8b, v6.8b
+; CHECK-NEXT:    eor v18.8b, v18.8b, v10.8b
+; CHECK-NEXT:    eor v8.8b, v9.8b, v8.8b
+; CHECK-NEXT:    rbit v9.8b, v0.8b
+; CHECK-NEXT:    rbit v10.8b, v1.8b
+; CHECK-NEXT:    pmul v22.8b, v26.8b, v25.8b
+; CHECK-NEXT:    shrn v2.8b, v2.8h, #8
+; CHECK-NEXT:    pmul v30.8b, v31.8b, v30.8b
+; CHECK-NEXT:    ushll v21.8h, v21.8b, #0
+; CHECK-NEXT:    rbit v19.8b, v19.8b
+; CHECK-NEXT:    rbit v27.8b, v27.8b
+; CHECK-NEXT:    eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT:    pmul v11.8b, v28.8b, v24.8b
+; CHECK-NEXT:    pmul v25.8b, v24.8b, v9.8b
+; CHECK-NEXT:    pmul v31.8b, v6.8b, v29.8b
+; CHECK-NEXT:    pmul v7.8b, v6.8b, v7.8b
+; CHECK-NEXT:    pmul v24.8b, v10.8b, v24.8b
+; CHECK-NEXT:    ushll v22.8h, v22.8b, #0
+; CHECK-NEXT:    pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT:    rbit v30.8b, v30.8b
+; CHECK-NEXT:    ushr v19.8b, v19.8b, #1
+; CHECK-NEXT:    ushr v27.8b, v27.8b, #1
+; CHECK-NEXT:    rbit v11.8b, v11.8b
+; CHECK-NEXT:    rbit v25.8b, v25.8b
+; CHECK-NEXT:    eor v31.8b, v31.8b, v12.8b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    rbit v24.8b, v24.8b
+; CHECK-NEXT:    eor v19.8b, v19.8b, v8.8b
+; CHECK-NEXT:    ushll v5.8h, v5.8b, #0
+; CHECK-NEXT:    eor v18.8b, v27.8b, v18.8b
+; CHECK-NEXT:    ushr v20.8b, v30.8b, #1
+; CHECK-NEXT:    pmul v27.8b, v16.8b, v3.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v16.8b
+; CHECK-NEXT:    pmul v2.8b, v2.8b, v0.8b
+; CHECK-NEXT:    pmul v3.8b, v1.8b, v3.8b
+; CHECK-NEXT:    ushr v26.8b, v11.8b, #1
+; CHECK-NEXT:    shll v19.8h, v19.8b, #8
+; CHECK-NEXT:    shll v18.8h, v18.8b, #8
+; CHECK-NEXT:    eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT:    pmul v20.8b, v28.8b, v23.8b
+; CHECK-NEXT:    pmul v28.8b, v1.8b, v29.8b
+; CHECK-NEXT:    pmul v23.8b, v29.8b, v0.8b
+; CHECK-NEXT:    ushr v24.8b, v24.8b, #1
+; CHECK-NEXT:    eor v26.8b, v26.8b, v31.8b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    shll v17.8h, v17.8b, #8
+; CHECK-NEXT:    orr v18.16b, v21.16b, v18.16b
+; CHECK-NEXT:    pmul v21.8b, v10.8b, v9.8b
 ; CHECK-NEXT:    ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT:    eor v16.16b, v16.16b, v18.16b
-; CHECK-NEXT:    uzp1 v2.8h, v0.8h, v2.8h
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull2 v5.4s, v0.8h, v5.8h
-; CHECK-NEXT:    umull2 v18.4s, v0.8h, v27.8h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v24.4s, v0.4h, v24.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v28.16b
-; CHECK-NEXT:    eor v3.16b, v25.16b, v3.16b
-; CHECK-NEXT:    eor v7.16b, v7.16b, v22.16b
-; CHECK-NEXT:    eor v16.16b, v16.16b, v23.16b
-; CHECK-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT:    umull2 v2.4s, v0.8h, v2.8h
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v5.16b, v20.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v18.16b
-; CHECK-NEXT:    eor v1.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v24.16b
-; CHECK-NEXT:    eor v5.16b, v17.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    eor v0.16b, v6.16b, v0.16b
-; CHECK-NEXT:    eor v1.16b, v5.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT:    rbit v20.8b, v20.8b
+; CHECK-NEXT:    ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT:    eor v16.8b, v28.8b, v16.8b
+; CHECK-NEXT:    orr v17.16b, v22.16b, v17.16b
+; CHECK-NEXT:    eor v23.8b, v27.8b, v23.8b
+; CHECK-NEXT:    ushr v22.8b, v25.8b, #1
+; CHECK-NEXT:    rbit v21.8b, v21.8b
+; CHECK-NEXT:    rev16 v18.8b, v18.8b
+; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v20.8b, #1
+; CHECK-NEXT:    eor v16.8b, v24.8b, v16.8b
+; CHECK-NEXT:    rev16 v17.8b, v17.8b
+; CHECK-NEXT:    eor v20.8b, v22.8b, v23.8b
+; CHECK-NEXT:    shll v22.8h, v26.8b, #8
+; CHECK-NEXT:    ushr v3.8b, v21.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    shll v7.8h, v16.8b, #8
+; CHECK-NEXT:    shll v6.8h, v20.8b, #8
+; CHECK-NEXT:    rbit v16.8b, v17.8b
+; CHECK-NEXT:    eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT:    eor v2.8b, v22.8b, v19.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v18.4h, #1
+; CHECK-NEXT:    eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    ushr v7.4h, v16.4h, #1
+; CHECK-NEXT:    shll v1.8h, v1.8b, #8
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    orr v3.16b, v5.16b, v4.16b
+; CHECK-NEXT:    eor v4.8b, v7.8b, v6.8b
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    ushll v2.4s, v3.4h, #0
+; CHECK-NEXT:    shll v3.4s, v4.4h, #16
+; CHECK-NEXT:    ushll v4.4s, v0.4h, #0
+; CHECK-NEXT:    orr v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    orr v1.16b, v4.16b, v3.16b
 ; CHECK-NEXT:    ldr d12, [sp], #48 // 8-byte Folded Reload
 ; CHECK-NEXT:    ret
   %zextx = zext <8 x i16> %x to <8 x i32>
@@ -4525,72 +4237,26 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulr_v8i16_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
-; CHECK-NEXT:    rev16 v3.16b, v0.16b
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v5.8h, #4
-; CHECK-NEXT:    movi v6.8h, #8
-; CHECK-NEXT:    movi v7.8h, #16
-; CHECK-NEXT:    movi v16.8h, #32
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    movi v18.8h, #1, lsl #8
-; CHECK-NEXT:    movi v19.8h, #8, lsl #8
-; CHECK-NEXT:    movi v20.8h, #16, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v3.16b
-; CHECK-NEXT:    movi v3.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #128, lsl #8
-; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -4605,87 +4271,29 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i16> @clmulr_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulr_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    shrn v0.4h, v0.4s, #15
+; CHECK-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEXT:    ret
   %a.ext = zext <4 x i16> %a to <4 x i32>
   %b.ext = zext <4 x i16> %b to <4 x i32>
@@ -4699,136 +4307,87 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; CHECK-LABEL: clmulr_v4i32_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev32 v1.16b, v1.16b
-; CHECK-NEXT:    rev32 v2.16b, v0.16b
-; CHECK-NEXT:    movi v3.4s, #2
-; CHECK-NEXT:    movi v4.4s, #1
-; CHECK-NEXT:    movi v5.4s, #4
-; CHECK-NEXT:    movi v6.4s, #8
-; CHECK-NEXT:    movi v7.4s, #16
-; CHECK-NEXT:    movi v16.4s, #32
-; CHECK-NEXT:    movi v17.4s, #64
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #128
-; CHECK-NEXT:    movi v21.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v18.4s, #32, lsl #8
-; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #1, lsl #16
-; CHECK-NEXT:    movi v20.4s, #4, lsl #8
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    movi v21.4s, #64, lsl #8
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #16
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #64, lsl #16
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT:    movi v18.4s, #4, lsl #16
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v21.4s, #1, lsl #24
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v18.4s, #128, lsl #8
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v2.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    rev32 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ret
@@ -4843,209 +4402,106 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i32> @clmulr_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-LABEL: clmulr_v2i32_neon:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    dup v3.2d, x9
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
-; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v5.2d, x9
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x9
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v23.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v24.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v26.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #31
+; CHECK-NEON-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-AES-LABEL: clmulr_v2i32_neon:
 ; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v1.8b, v1.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
 ; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT:    shrn v0.2s, v0.2d, #31
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ret
   %a.ext = zext <2 x i32> %a to <2 x i64>
   %b.ext = zext <2 x i32> %b to <2 x i64>
@@ -5113,72 +4569,26 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulh_v8i16_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev16 v1.16b, v1.16b
-; CHECK-NEXT:    rev16 v3.16b, v0.16b
-; CHECK-NEXT:    movi v2.8h, #2
-; CHECK-NEXT:    movi v4.8h, #1
-; CHECK-NEXT:    movi v5.8h, #4
-; CHECK-NEXT:    movi v6.8h, #8
-; CHECK-NEXT:    movi v7.8h, #16
-; CHECK-NEXT:    movi v16.8h, #32
-; CHECK-NEXT:    movi v17.8h, #128
-; CHECK-NEXT:    movi v18.8h, #1, lsl #8
-; CHECK-NEXT:    movi v19.8h, #8, lsl #8
-; CHECK-NEXT:    movi v20.8h, #16, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v3.16b
-; CHECK-NEXT:    movi v3.8h, #64
-; CHECK-NEXT:    movi v21.8h, #2, lsl #8
-; CHECK-NEXT:    movi v22.8h, #32, lsl #8
-; CHECK-NEXT:    movi v23.8h, #4, lsl #8
-; CHECK-NEXT:    movi v24.8h, #64, lsl #8
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT:    mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT:    mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT:    mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT:    mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT:    mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT:    and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    movi v6.8h, #128, lsl #8
-; CHECK-NEXT:    mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT:    mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT:    mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT:    eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT:    eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT:    mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT:    eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT:    eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT:    eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT:    mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v0.16b, v0.16b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
 ; CHECK-NEXT:    rev16 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ushr v0.8h, v0.8h, #1
@@ -5194,87 +4604,30 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
 define <4 x i16> @clmulh_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: clmulh_v4i16_neon:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v2.4s, #2
-; CHECK-NEXT:    movi v3.4s, #1
-; CHECK-NEXT:    movi v4.4s, #4
-; CHECK-NEXT:    movi v5.4s, #8
-; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
-; CHECK-NEXT:    movi v6.4s, #16
-; CHECK-NEXT:    movi v7.4s, #32
-; CHECK-NEXT:    movi v16.4s, #128
-; CHECK-NEXT:    movi v17.4s, #1, lsl #8
-; CHECK-NEXT:    movi v18.4s, #8, lsl #8
-; CHECK-NEXT:    movi v19.4s, #16, lsl #8
-; CHECK-NEXT:    movi v20.4s, #64
-; CHECK-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT:    movi v21.4s, #2, lsl #8
-; CHECK-NEXT:    movi v22.4s, #32, lsl #8
-; CHECK-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT:    xtn v2.4h, v2.4s
-; CHECK-NEXT:    xtn v3.4h, v3.4s
-; CHECK-NEXT:    xtn v4.4h, v4.4s
-; CHECK-NEXT:    xtn v5.4h, v5.4s
-; CHECK-NEXT:    movi v23.4s, #4, lsl #8
-; CHECK-NEXT:    movi v24.4s, #64, lsl #8
-; CHECK-NEXT:    xtn v6.4h, v6.4s
-; CHECK-NEXT:    xtn v7.4h, v7.4s
-; CHECK-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT:    xtn v16.4h, v16.4s
-; CHECK-NEXT:    xtn v17.4h, v17.4s
-; CHECK-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT:    xtn v18.4h, v18.4s
-; CHECK-NEXT:    xtn v19.4h, v19.4s
-; CHECK-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT:    umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT:    umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT:    movi v25.4s, #128, lsl #8
-; CHECK-NEXT:    xtn v20.4h, v20.4s
-; CHECK-NEXT:    xtn v21.4h, v21.4s
-; CHECK-NEXT:    and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT:    xtn v22.4h, v22.4s
-; CHECK-NEXT:    and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT:    umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT:    umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT:    umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT:    umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT:    umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT:    umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT:    and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT:    xtn v4.4h, v23.4s
-; CHECK-NEXT:    xtn v5.4h, v24.4s
-; CHECK-NEXT:    umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT:    umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT:    umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT:    xtn v1.4h, v1.4s
-; CHECK-NEXT:    umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT:    umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT:    eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT:    eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT:    eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT:    eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT:    shrn v0.4h, v0.4s, #16
+; CHECK-NEXT:    rev16 v1.8b, v1.8b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    xtn v2.8b, v1.8h
+; CHECK-NEXT:    xtn v3.8b, v0.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v1.8b, v1.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v2.8b
+; CHECK-NEXT:    rbit v5.8b, v3.8b
+; CHECK-NEXT:    pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT:    pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT:    pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    ushr v1.8b, v4.8b, #1
+; CHECK-NEXT:    eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT:    ushll v1.8h, v2.8b, #0
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev16 v0.8b, v0.8b
+; CHECK-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEXT:    ushr v0.4h, v0.4h, #1
 ; CHECK-NEXT:    ret
   %a.ext = zext <4 x i16> %a to <4 x i32>
   %b.ext = zext <4 x i16> %b to <4 x i32>
@@ -5288,136 +4641,87 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; CHECK-LABEL: clmulh_v4i32_neon:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    rev32 v1.16b, v1.16b
-; CHECK-NEXT:    rev32 v2.16b, v0.16b
-; CHECK-NEXT:    movi v3.4s, #2
-; CHECK-NEXT:    movi v4.4s, #1
-; CHECK-NEXT:    movi v5.4s, #4
-; CHECK-NEXT:    movi v6.4s, #8
-; CHECK-NEXT:    movi v7.4s, #16
-; CHECK-NEXT:    movi v16.4s, #32
-; CHECK-NEXT:    movi v17.4s, #64
-; CHECK-NEXT:    movi v18.4s, #1, lsl #8
-; CHECK-NEXT:    movi v19.4s, #2, lsl #8
-; CHECK-NEXT:    movi v20.4s, #8, lsl #8
-; CHECK-NEXT:    rbit v0.16b, v1.16b
-; CHECK-NEXT:    rbit v1.16b, v2.16b
-; CHECK-NEXT:    movi v2.4s, #128
-; CHECK-NEXT:    movi v21.4s, #16, lsl #8
-; CHECK-NEXT:    movi v22.4s, #8, lsl #16
-; CHECK-NEXT:    movi v23.4s, #2, lsl #24
-; CHECK-NEXT:    movi v25.4s, #4, lsl #24
-; CHECK-NEXT:    movi v24.4s, #32, lsl #16
-; CHECK-NEXT:    movi v26.4s, #8, lsl #24
-; CHECK-NEXT:    and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT:    and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT:    and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT:    eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT:    mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v18.4s, #32, lsl #8
-; CHECK-NEXT:    and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT:    movi v19.4s, #1, lsl #16
-; CHECK-NEXT:    movi v20.4s, #4, lsl #8
-; CHECK-NEXT:    eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT:    and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #2, lsl #16
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT:    movi v21.4s, #64, lsl #8
-; CHECK-NEXT:    mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT:    eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT:    and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT:    movi v19.4s, #128, lsl #16
-; CHECK-NEXT:    and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT:    and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT:    movi v17.4s, #64, lsl #16
-; CHECK-NEXT:    eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT:    movi v18.4s, #4, lsl #16
-; CHECK-NEXT:    mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT:    eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT:    and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v21.4s, #1, lsl #24
-; CHECK-NEXT:    and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT:    mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT:    and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT:    and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT:    movi v18.4s, #128, lsl #8
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #16
-; CHECK-NEXT:    mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT:    eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT:    and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT:    movi v24.4s, #64, lsl #24
-; CHECK-NEXT:    eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT:    eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT:    and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT:    mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT:    and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT:    movi v22.4s, #16, lsl #24
-; CHECK-NEXT:    movi v23.4s, #32, lsl #24
-; CHECK-NEXT:    eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT:    mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT:    mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT:    mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT:    eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT:    eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT:    movi v16.4s, #128, lsl #24
-; CHECK-NEXT:    mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT:    and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT:    and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT:    and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT:    eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT:    eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT:    and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT:    mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT:    mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT:    mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT:    eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    rev32 v0.16b, v0.16b
+; CHECK-NEXT:    rbit v1.16b, v1.16b
+; CHECK-NEXT:    rbit v2.16b, v0.16b
+; CHECK-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    rev32 v0.16b, v0.16b
 ; CHECK-NEXT:    rbit v0.16b, v0.16b
 ; CHECK-NEXT:    ushr v0.4s, v0.4s, #1
@@ -5433,209 +4737,108 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
 define <2 x i32> @clmulh_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-NEON-LABEL: clmulh_v2i32_neon:
 ; CHECK-NEON:       // %bb.0:
-; CHECK-NEON-NEXT:    mov w8, #2 // =0x2
-; CHECK-NEON-NEXT:    mov w9, #1 // =0x1
-; CHECK-NEON-NEXT:    ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT:    dup v2.2d, x8
-; CHECK-NEON-NEXT:    dup v3.2d, x9
-; CHECK-NEON-NEXT:    mov w8, #4 // =0x4
-; CHECK-NEON-NEXT:    mov w9, #8 // =0x8
-; CHECK-NEON-NEXT:    dup v4.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16 // =0x10
-; CHECK-NEON-NEXT:    dup v5.2d, x9
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #32 // =0x20
-; CHECK-NEON-NEXT:    and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT:    mov w8, #64 // =0x40
-; CHECK-NEON-NEXT:    mov w9, #512 // =0x200
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT:    xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT:    mov w8, #128 // =0x80
-; CHECK-NEON-NEXT:    xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v17.2d, x8
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    mov w8, #256 // =0x100
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT:    and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT:    eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v16.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT:    and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x9
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT:    umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    dup v6.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    dup v23.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT:    eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT:    movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT:    xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT:    dup v7.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT:    xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT:    eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v18.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT:    dup v21.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT:    and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    dup v22.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT:    and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT:    xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v20.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT:    and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT:    umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT:    dup v19.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT:    and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    dup v24.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT:    dup v25.2d, x8
-; CHECK-NEON-NEXT:    mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT:    and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT:    xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT:    xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT:    dup v26.2d, x8
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT:    xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT:    and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT:    and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT:    and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT:    umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT:    umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT:    and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT:    eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT:    xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT:    xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT:    xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT:    umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT:    umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT:    xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT:    eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT:    eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT:    umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT:    umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT:    eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT:    eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT:    umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT:    eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT:    eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT:    eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT:    eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT:    eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT:    xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT:    shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT:    shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT:    xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT:    shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT:    rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT:    rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT:    xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT:    xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT:    xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT:    shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT:    shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT:    shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT:    rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT:    rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT:    rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT:    pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT:    pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT:    xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT:    xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT:    shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT:    shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT:    pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT:    rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT:    pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT:    pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT:    pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT:    rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT:    rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT:    ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT:    ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT:    eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT:    pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT:    ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT:    eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT:    shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT:    eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT:    pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT:    eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT:    pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT:    pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT:    orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT:    rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT:    shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT:    shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT:    ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT:    rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT:    ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT:    ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT:    ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT:    orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT:    orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT:    rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT:    eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT:    eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT:    shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT:    ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT:    orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT:    eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT:    ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT:    shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT:    orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT:    rbit v0.8b, v0.8b
+; CHECK-NEON-NEXT:    ushr v0.2s, v0.2s, #1
 ; CHECK-NEON-NEXT:    ret
 ;
 ; CHECK-AES-LABEL: clmulh_v2i32_neon:
 ; CHECK-AES:       // %bb.0:
+; CHECK-AES-NEXT:    rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v1.8b, v1.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
 ; CHECK-AES-NEXT:    ushll v1.2d, v1.2s, #0
 ; CHECK-AES-NEXT:    ushll v0.2d, v0.2s, #0
 ; CHECK-AES-NEXT:    pmull2 v2.1q, v0.2d, v1.2d
 ; CHECK-AES-NEXT:    pmull v0.1q, v0.1d, v1.1d
 ; CHECK-AES-NEXT:    mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT:    shrn v0.2s, v0.2d, #32
+; CHECK-AES-NEXT:    xtn v0.2s, v0.2d
+; CHECK-AES-NEXT:    rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT:    rbit v0.8b, v0.8b
+; CHECK-AES-NEXT:    ushr v0.2s, v0.2s, #1
 ; CHECK-AES-NEXT:    ret
   %a.ext = zext <2 x i32> %a to <2 x i64>
   %b.ext = zext <2 x i32> %b to <2 x i64>
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
index f57dbeade4805..1bebca731bc2d 100644
--- a/llvm/test/CodeGen/PowerPC/clmul-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -8779,7 +8779,7 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ;
 ; LE-LABEL: clmulh_v2i64:
 ; LE:       # %bb.0:
-; LE-NEXT:    stdu 1, -736(1)
+; LE-NEXT:    stdu 1, -752(1)
 ; LE-NEXT:    lis 4, -21846
 ; LE-NEXT:    lis 5, 21845
 ; LE-NEXT:    xxswapd 1, 35
@@ -8792,8 +8792,8 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    ori 5, 5, 21845
 ; LE-NEXT:    mffprd 8, 1
 ; LE-NEXT:    mffprd 10, 0
-; LE-NEXT:    std 28, 704(1) # 8-byte Folded Spill
-; LE-NEXT:    std 29, 712(1) # 8-byte Folded Spill
+; LE-NEXT:    std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 728(1) # 8-byte Folded Spill
 ; LE-NEXT:    ori 6, 6, 52428
 ; LE-NEXT:    ori 7, 7, 13107
 ; LE-NEXT:    sldi 4, 4, 32
@@ -8802,7 +8802,7 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    sldi 7, 7, 32
 ; LE-NEXT:    sldi 11, 3, 1
 ; LE-NEXT:    rldicl 3, 3, 63, 1
-; LE-NEXT:    std 30, 720(1) # 8-byte Folded Spill
+; LE-NEXT:    std 30, 736(1) # 8-byte Folded Spill
 ; LE-NEXT:    lis 0, -3856
 ; LE-NEXT:    oris 4, 4, 43690
 ; LE-NEXT:    oris 5, 5, 21845
@@ -8811,48 +8811,50 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    sldi 12, 10, 1
 ; LE-NEXT:    rldicl 10, 10, 63, 1
 ; LE-NEXT:    oris 7, 7, 13107
-; LE-NEXT:    std 27, 696(1) # 8-byte Folded Spill
+; LE-NEXT:    std 27, 712(1) # 8-byte Folded Spill
 ; LE-NEXT:    ori 28, 4, 43690
 ; LE-NEXT:    ori 29, 5, 21845
-; LE-NEXT:    std 14, 592(1) # 8-byte Folded Spill
-; LE-NEXT:    std 15, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT:    std 15, 616(1) # 8-byte Folded Spill
 ; LE-NEXT:    sldi 4, 8, 1
 ; LE-NEXT:    rldicl 5, 8, 63, 1
-; LE-NEXT:    std 16, 608(1) # 8-byte Folded Spill
-; LE-NEXT:    std 17, 616(1) # 8-byte Folded Spill
+; LE-NEXT:    std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT:    std 17, 632(1) # 8-byte Folded Spill
 ; LE-NEXT:    sldi 8, 9, 1
 ; LE-NEXT:    rldicl 9, 9, 63, 1
-; LE-NEXT:    std 28, 568(1) # 8-byte Folded Spill
-; LE-NEXT:    std 29, 576(1) # 8-byte Folded Spill
+; LE-NEXT:    std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    std 29, 592(1) # 8-byte Folded Spill
 ; LE-NEXT:    and 11, 11, 28
 ; LE-NEXT:    and 3, 3, 29
-; LE-NEXT:    std 18, 624(1) # 8-byte Folded Spill
-; LE-NEXT:    std 19, 632(1) # 8-byte Folded Spill
+; LE-NEXT:    std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT:    std 19, 648(1) # 8-byte Folded Spill
 ; LE-NEXT:    and 4, 4, 28
 ; LE-NEXT:    and 5, 5, 29
-; LE-NEXT:    std 20, 640(1) # 8-byte Folded Spill
-; LE-NEXT:    std 21, 648(1) # 8-byte Folded Spill
+; LE-NEXT:    std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT:    std 21, 664(1) # 8-byte Folded Spill
 ; LE-NEXT:    and 8, 8, 28
 ; LE-NEXT:    and 9, 9, 29
-; LE-NEXT:    std 22, 656(1) # 8-byte Folded Spill
-; LE-NEXT:    std 23, 664(1) # 8-byte Folded Spill
+; LE-NEXT:    std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT:    std 23, 680(1) # 8-byte Folded Spill
 ; LE-NEXT:    and 12, 12, 28
 ; LE-NEXT:    and 10, 10, 29
-; LE-NEXT:    std 24, 672(1) # 8-byte Folded Spill
-; LE-NEXT:    std 25, 680(1) # 8-byte Folded Spill
+; LE-NEXT:    std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT:    std 25, 696(1) # 8-byte Folded Spill
 ; LE-NEXT:    or 3, 3, 11
 ; LE-NEXT:    or 4, 5, 4
-; LE-NEXT:    std 26, 688(1) # 8-byte Folded Spill
-; LE-NEXT:    std 31, 728(1) # 8-byte Folded Spill
+; LE-NEXT:    std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT:    std 31, 744(1) # 8-byte Folded Spill
 ; LE-NEXT:    ori 5, 0, 61680
 ; LE-NEXT:    ori 11, 30, 3855
-; LE-NEXT:    std 2, 584(1) # 8-byte Folded Spill
+; LE-NEXT:    std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT:    vspltisw 2, 1
 ; LE-NEXT:    ori 30, 6, 52428
 ; LE-NEXT:    ori 0, 7, 13107
-; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
-; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT:    std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 576(1) # 8-byte Folded Spill
 ; LE-NEXT:    or 6, 9, 8
 ; LE-NEXT:    or 7, 10, 12
+; LE-NEXT:    vupklsw 2, 2
 ; LE-NEXT:    sldi 8, 3, 2
 ; LE-NEXT:    rldicl 3, 3, 62, 2
 ; LE-NEXT:    sldi 9, 4, 2
@@ -8876,9 +8878,9 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    or 3, 3, 8
 ; LE-NEXT:    or 4, 4, 9
 ; LE-NEXT:    ori 30, 5, 61680
-; LE-NEXT:    std 30, 536(1) # 8-byte Folded Spill
+; LE-NEXT:    std 30, 552(1) # 8-byte Folded Spill
 ; LE-NEXT:    ori 0, 10, 3855
-; LE-NEXT:    std 0, 544(1) # 8-byte Folded Spill
+; LE-NEXT:    std 0, 560(1) # 8-byte Folded Spill
 ; LE-NEXT:    or 5, 6, 11
 ; LE-NEXT:    or 6, 7, 12
 ; LE-NEXT:    sldi 7, 3, 4
@@ -8929,167 +8931,172 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    rlwimi 4, 11, 8, 24, 31
 ; LE-NEXT:    or 10, 5, 4
 ; LE-NEXT:    rlwinm 4, 3, 0, 30, 30
-; LE-NEXT:    std 4, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 544(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 5, 5
-; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 384(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 4, 4
-; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 376(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 3, 3
-; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 368(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 2, 2
-; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 360(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 1, 1
-; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 352(1) # 8-byte Folded Spill
 ; LE-NEXT:    rlwinm 4, 3, 0, 0, 0
-; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 344(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 32, 32
-; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 336(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 31, 33
-; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 280(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 30, 34
-; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 29, 35
-; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 28, 36
-; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 27, 37
-; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 248(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 26, 38
-; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 240(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 25, 39
-; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 232(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 24, 40
 ; LE-NEXT:    rldicl 0, 6, 32, 32
 ; LE-NEXT:    rotlwi 30, 6, 24
 ; LE-NEXT:    rotlwi 27, 0, 24
-; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 224(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 23, 41
 ; LE-NEXT:    rlwimi 30, 6, 8, 8, 15
 ; LE-NEXT:    rlwimi 30, 6, 8, 24, 31
 ; LE-NEXT:    rlwimi 27, 0, 8, 8, 15
-; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 216(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 22, 42
 ; LE-NEXT:    sldi 6, 30, 32
 ; LE-NEXT:    rlwimi 27, 0, 8, 24, 31
 ; LE-NEXT:    or 11, 6, 27
-; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 208(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 21, 43
 ; LE-NEXT:    clrldi 5, 3, 63
 ; LE-NEXT:    rlwinm 6, 3, 0, 29, 29
 ; LE-NEXT:    rlwinm 7, 3, 0, 28, 28
-; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 200(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 20, 44
 ; LE-NEXT:    rlwinm 8, 3, 0, 27, 27
 ; LE-NEXT:    rlwinm 12, 3, 0, 26, 26
 ; LE-NEXT:    rlwinm 0, 3, 0, 25, 25
-; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 192(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 19, 45
 ; LE-NEXT:    rlwinm 30, 3, 0, 24, 24
 ; LE-NEXT:    rlwinm 29, 3, 0, 23, 23
 ; LE-NEXT:    rlwinm 28, 3, 0, 22, 22
-; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 184(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 18, 46
 ; LE-NEXT:    rlwinm 27, 3, 0, 21, 21
 ; LE-NEXT:    rlwinm 26, 3, 0, 20, 20
 ; LE-NEXT:    rlwinm 25, 3, 0, 19, 19
-; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 176(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 17, 47
 ; LE-NEXT:    rlwinm 24, 3, 0, 18, 18
 ; LE-NEXT:    rlwinm 23, 3, 0, 17, 17
 ; LE-NEXT:    rlwinm 22, 3, 0, 16, 16
-; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 168(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 16, 48
 ; LE-NEXT:    rlwinm 21, 3, 0, 15, 15
 ; LE-NEXT:    rlwinm 20, 3, 0, 14, 14
 ; LE-NEXT:    rlwinm 19, 3, 0, 13, 13
-; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 160(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 15, 49
 ; LE-NEXT:    rlwinm 18, 3, 0, 12, 12
 ; LE-NEXT:    rlwinm 17, 3, 0, 11, 11
 ; LE-NEXT:    rlwinm 16, 3, 0, 10, 10
-; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 152(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 14, 50
 ; LE-NEXT:    rlwinm 15, 3, 0, 9, 9
 ; LE-NEXT:    rlwinm 14, 3, 0, 8, 8
 ; LE-NEXT:    rlwinm 31, 3, 0, 7, 7
-; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 144(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 13, 51
 ; LE-NEXT:    rlwinm 2, 3, 0, 6, 6
-; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 136(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 12, 52
-; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 128(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 11, 53
-; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 120(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 10, 54
-; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 112(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 9, 55
-; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 104(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 8, 56
-; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 96(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 7, 57
-; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 88(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 6, 58
-; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 80(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 5, 59
-; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 72(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 4, 60
-; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 64(1) # 8-byte Folded Spill
 ; LE-NEXT:    rldicl 4, 3, 3, 61
-; LE-NEXT:    rldicl 3, 3, 2, 62
-; LE-NEXT:    std 3, 32(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 528(1) # 8-byte Folded Reload
-; LE-NEXT:    std 4, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT:    rldicl 4, 3, 2, 62
+; LE-NEXT:    rldicr 3, 3, 0, 0
+; LE-NEXT:    std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    std 4, 48(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 3
-; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 5
-; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 288(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 6
-; LE-NEXT:    std 3, 296(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 3, 11, 7
 ; LE-NEXT:    std 3, 304(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 3, 11, 8
+; LE-NEXT:    mulld 3, 11, 7
 ; LE-NEXT:    std 3, 312(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 3, 11, 12
+; LE-NEXT:    mulld 3, 11, 8
 ; LE-NEXT:    std 3, 320(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    mulld 3, 11, 12
 ; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 0
+; LE-NEXT:    std 3, 544(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 30
-; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 536(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 29
-; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 528(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 28
-; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 520(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 27
-; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 512(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 26
-; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 504(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 25
-; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 496(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 24
-; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 488(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 23
-; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 480(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 22
-; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 472(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 21
-; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 464(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 20
-; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 456(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 19
-; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 448(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 18
-; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 440(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 17
-; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 432(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 16
-; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 424(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 15
-; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 416(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 14
-; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 408(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 31
-; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    std 3, 400(1) # 8-byte Folded Spill
 ; LE-NEXT:    mulld 3, 11, 2
+; LE-NEXT:    std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
 ; LE-NEXT:    std 3, 384(1) # 8-byte Folded Spill
 ; LE-NEXT:    ld 3, 376(1) # 8-byte Folded Reload
 ; LE-NEXT:    mulld 3, 11, 3
@@ -9107,109 +9114,112 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    mulld 3, 11, 3
 ; LE-NEXT:    std 3, 344(1) # 8-byte Folded Spill
 ; LE-NEXT:    ld 3, 336(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 3, 11, 3
-; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 4, 3, 32, 31
-; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 4, 11, 4
+; LE-NEXT:    ld 3, 280(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 5, 3, 33, 30
-; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 3, 272(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 6, 3, 34, 29
-; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
-; LE-NEXT:    std 4, 272(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 4, 11, 5
-; LE-NEXT:    ld 5, 280(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 3, 264(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 7, 3, 35, 28
-; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 8, 3, 36, 27
-; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
-; LE-NEXT:    std 4, 264(1) # 8-byte Folded Spill
-; LE-NEXT:    mulld 4, 11, 6
-; LE-NEXT:    mulld 6, 11, 7
-; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 12, 3, 37, 26
-; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 8, 11, 12
-; LE-NEXT:    std 4, 256(1) # 8-byte Folded Spill
-; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 0, 3, 38, 25
-; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 4, 10, 4
-; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    ld 3, 232(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 30, 3, 39, 24
-; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 29, 3, 40, 23
-; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 28, 3, 41, 22
-; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    ld 3, 208(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 27, 3, 42, 21
-; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    ld 3, 200(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 26, 3, 43, 20
-; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    ld 3, 192(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 25, 3, 44, 19
-; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    ld 3, 184(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 24, 3, 45, 18
-; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    ld 3, 176(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 23, 3, 46, 17
-; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    ld 3, 168(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 22, 3, 47, 16
-; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    ld 3, 160(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 21, 3, 48, 15
-; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    ld 3, 152(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 20, 3, 49, 14
-; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    ld 3, 144(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 19, 3, 50, 13
-; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    ld 3, 136(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 18, 3, 51, 12
-; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    ld 3, 128(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 17, 3, 52, 11
-; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    ld 3, 120(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 16, 3, 53, 10
-; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    ld 3, 112(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 15, 3, 54, 9
-; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    ld 3, 104(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 14, 3, 55, 8
-; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    ld 3, 96(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 31, 3, 56, 7
-; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    ld 3, 88(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 2, 3, 57, 6
-; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    ld 3, 80(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 3, 3, 58, 5
-; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 72(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 3, 3, 59, 4
-; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 64(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 3, 3, 60, 3
-; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 56(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 3, 3, 61, 2
-; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 32(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 48(1) # 8-byte Folded Reload
 ; LE-NEXT:    rldicl 3, 3, 62, 1
-; LE-NEXT:    std 3, 216(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 4
+; LE-NEXT:    clrldi 4, 9, 63
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 5
+; LE-NEXT:    ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT:    mulld 3, 11, 6
+; LE-NEXT:    mulld 6, 11, 7
+; LE-NEXT:    mulld 7, 11, 8
+; LE-NEXT:    mulld 8, 11, 12
+; LE-NEXT:    mulld 12, 11, 0
+; LE-NEXT:    mulld 0, 11, 30
+; LE-NEXT:    mulld 30, 11, 29
+; LE-NEXT:    mulld 29, 11, 28
+; LE-NEXT:    mulld 28, 11, 27
+; LE-NEXT:    mulld 27, 11, 26
+; LE-NEXT:    mulld 26, 11, 25
+; LE-NEXT:    mulld 25, 11, 24
+; LE-NEXT:    mulld 24, 11, 23
+; LE-NEXT:    mulld 23, 11, 22
+; LE-NEXT:    mulld 22, 11, 21
+; LE-NEXT:    mulld 21, 11, 20
+; LE-NEXT:    mulld 20, 11, 19
+; LE-NEXT:    mulld 19, 11, 18
+; LE-NEXT:    mulld 18, 11, 17
+; LE-NEXT:    mulld 17, 11, 16
+; LE-NEXT:    mulld 16, 11, 15
+; LE-NEXT:    mulld 15, 11, 14
+; LE-NEXT:    mulld 14, 11, 31
+; LE-NEXT:    mulld 31, 11, 2
+; LE-NEXT:    std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT:    ld 3, 256(1) # 8-byte Folded Reload
 ; LE-NEXT:    mulld 2, 11, 3
+; LE-NEXT:    ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT:    mulld 3, 11, 3
+; LE-NEXT:    std 3, 256(1) # 8-byte Folded Spill
 ; LE-NEXT:    ld 3, 240(1) # 8-byte Folded Reload
 ; LE-NEXT:    mulld 3, 11, 3
 ; LE-NEXT:    std 3, 248(1) # 8-byte Folded Spill
@@ -9217,46 +9227,42 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    mulld 3, 11, 3
 ; LE-NEXT:    std 3, 240(1) # 8-byte Folded Spill
 ; LE-NEXT:    ld 3, 224(1) # 8-byte Folded Reload
-; LE-NEXT:    mulld 3, 11, 3
-; LE-NEXT:    std 3, 232(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 216(1) # 8-byte Folded Reload
 ; LE-NEXT:    mulld 11, 11, 3
 ; LE-NEXT:    rlwinm 3, 9, 0, 30, 30
 ; LE-NEXT:    mulld 3, 10, 3
 ; LE-NEXT:    xor 3, 4, 3
-; LE-NEXT:    ld 4, 288(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 4, 296(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 4, 5, 4
 ; LE-NEXT:    rlwinm 5, 9, 0, 29, 29
 ; LE-NEXT:    mulld 5, 10, 5
 ; LE-NEXT:    xor 3, 3, 5
-; LE-NEXT:    ld 5, 296(1) # 8-byte Folded Reload
-; LE-NEXT:    xor 4, 4, 5
-; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
-; LE-NEXT:    mulld 5, 10, 5
-; LE-NEXT:    xor 3, 3, 5
 ; LE-NEXT:    ld 5, 304(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 4, 4, 5
-; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
+; LE-NEXT:    rlwinm 5, 9, 0, 28, 28
 ; LE-NEXT:    mulld 5, 10, 5
 ; LE-NEXT:    xor 3, 3, 5
 ; LE-NEXT:    ld 5, 312(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 4, 4, 5
-; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
+; LE-NEXT:    rlwinm 5, 9, 0, 27, 27
 ; LE-NEXT:    mulld 5, 10, 5
 ; LE-NEXT:    xor 3, 3, 5
 ; LE-NEXT:    ld 5, 320(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 4, 4, 5
-; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
+; LE-NEXT:    rlwinm 5, 9, 0, 26, 26
 ; LE-NEXT:    mulld 5, 10, 5
 ; LE-NEXT:    xor 3, 3, 5
 ; LE-NEXT:    ld 5, 328(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 4, 4, 5
-; LE-NEXT:    rlwinm 5, 9, 0, 24, 24
+; LE-NEXT:    rlwinm 5, 9, 0, 25, 25
 ; LE-NEXT:    mulld 5, 10, 5
 ; LE-NEXT:    xor 3, 3, 5
 ; LE-NEXT:    std 3, 328(1) # 8-byte Folded Spill
-; LE-NEXT:    ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 3, 544(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 520(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 512(1) # 8-byte Folded Reload
@@ -9303,70 +9309,70 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 344(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 4, 280(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 272(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 264(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    ld 4, 256(1) # 8-byte Folded Reload
-; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 6
-; LE-NEXT:    ld 6, 576(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 6, 592(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 7
-; LE-NEXT:    ld 7, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 7, 584(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 8
-; LE-NEXT:    ld 8, 560(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 8, 576(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 12
-; LE-NEXT:    ld 12, 544(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 12, 560(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 0
-; LE-NEXT:    ld 0, 536(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 0, 552(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 30
-; LE-NEXT:    ld 30, 720(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 30, 736(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 29
-; LE-NEXT:    ld 29, 712(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 29, 728(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 28
-; LE-NEXT:    ld 28, 704(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 28, 720(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 27
-; LE-NEXT:    ld 27, 696(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 27, 712(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 26
-; LE-NEXT:    ld 26, 688(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 26, 704(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 25
-; LE-NEXT:    ld 25, 680(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 25, 696(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 24
-; LE-NEXT:    ld 24, 672(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 24, 688(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 23
-; LE-NEXT:    ld 23, 664(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 23, 680(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 22
-; LE-NEXT:    ld 22, 656(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 22, 672(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 21
-; LE-NEXT:    ld 21, 648(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 21, 664(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 20
-; LE-NEXT:    ld 20, 640(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 20, 656(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 19
-; LE-NEXT:    ld 19, 632(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 19, 648(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 18
-; LE-NEXT:    ld 18, 624(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 18, 640(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 17
-; LE-NEXT:    ld 17, 616(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 17, 632(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 16
-; LE-NEXT:    ld 16, 608(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 16, 624(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 15
-; LE-NEXT:    ld 15, 600(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 15, 616(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 14
-; LE-NEXT:    ld 14, 592(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 14, 608(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 31
-; LE-NEXT:    ld 31, 728(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 31, 744(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 2
-; LE-NEXT:    ld 2, 584(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 2, 600(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 4, 248(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
-; LE-NEXT:    ld 4, 232(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 4, 240(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    ld 4, 336(1) # 8-byte Folded Reload
 ; LE-NEXT:    xor 3, 3, 11
-; LE-NEXT:    ld 11, 552(1) # 8-byte Folded Reload
+; LE-NEXT:    ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    sldi 4, 3, 1
 ; LE-NEXT:    rldicl 3, 3, 63, 1
 ; LE-NEXT:    and 4, 4, 7
@@ -9392,11 +9398,13 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    sldi 4, 5, 32
 ; LE-NEXT:    or 3, 4, 3
 ; LE-NEXT:    ld 4, 328(1) # 8-byte Folded Reload
-; LE-NEXT:    rldicl 3, 3, 63, 1
 ; LE-NEXT:    mtfprd 0, 3
-; LE-NEXT:    rlwinm 3, 9, 0, 23, 23
+; LE-NEXT:    rlwinm 3, 9, 0, 24, 24
 ; LE-NEXT:    mulld 3, 10, 3
 ; LE-NEXT:    xor 3, 4, 3
+; LE-NEXT:    rlwinm 4, 9, 0, 23, 23
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    rlwinm 4, 9, 0, 22, 22
 ; LE-NEXT:    mulld 4, 10, 4
 ; LE-NEXT:    xor 3, 3, 4
@@ -9590,6 +9598,9 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    rldicl 4, 4, 62, 1
 ; LE-NEXT:    mulld 4, 10, 4
 ; LE-NEXT:    xor 3, 3, 4
+; LE-NEXT:    rldicr 4, 9, 0, 0
+; LE-NEXT:    mulld 4, 10, 4
+; LE-NEXT:    xor 3, 3, 4
 ; LE-NEXT:    sldi 4, 3, 1
 ; LE-NEXT:    rldicl 3, 3, 63, 1
 ; LE-NEXT:    and 4, 4, 7
@@ -9614,10 +9625,10 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; LE-NEXT:    rlwimi 4, 3, 8, 24, 31
 ; LE-NEXT:    sldi 3, 4, 32
 ; LE-NEXT:    or 3, 3, 5
-; LE-NEXT:    rldicl 3, 3, 63, 1
 ; LE-NEXT:    mtfprd 1, 3
-; LE-NEXT:    xxmrghd 34, 1, 0
-; LE-NEXT:    addi 1, 1, 736
+; LE-NEXT:    xxmrghd 35, 1, 0
+; LE-NEXT:    vsrd 2, 3, 2
+; LE-NEXT:    addi 1, 1, 752
 ; LE-NEXT:    blr
   %a.ext = zext <2 x i64> %a to <2 x i128>
   %b.ext = zext <2 x i64> %b to <2 x i128>
diff --git a/llvm/test/CodeGen/X86/clmul-vector.ll b/llvm/test/CodeGen/X86/clmul-vector.ll
index 8f26f84c01883..8ca41b57072ed 100644
--- a/llvm/test/CodeGen/X86/clmul-vector.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector.ll
@@ -434,97 +434,78 @@ define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ;
 ; AVX2-LABEL: clmul_v8i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
-; AVX2-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT:    vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm3, %xmm0
-; AVX2-NEXT:    vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT:    vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX2-NEXT:    vmovq %xmm5, %rax
+; AVX2-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT:    vmovq %xmm2, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT:    vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: clmul_v8i16:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpxor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm2 ^ xmm3
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT:    vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT:    vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT:    vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm0 ^ xmm3 ^ xmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT:    vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX512-NEXT:    vmovq %xmm5, %rax
+; AVX512-NEXT:    vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX512-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT:    vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX512-NEXT:    vmovq %xmm4, %rax
+; AVX512-NEXT:    vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT:    vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX512-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b)
   ret <8 x i16> %res



More information about the llvm-commits mailing list