[llvm] [SelectionDAG] Use Karatsuba decomposition to expand vector CLMUL via narrower legal types (PR #184468)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 4 02:14:59 PST 2026
https://github.com/AbdallahRashed updated https://github.com/llvm/llvm-project/pull/184468
>From f12211ab888801f79070cb2fb842b25480443860 Mon Sep 17 00:00:00 2001
From: AbdallahRashed <abdallah.mrashed at gmail.com>
Date: Sat, 28 Feb 2026 22:57:00 +0100
Subject: [PATCH] [SelectionDAG] Use Karatsuba decomposition to expand vector
CLMUL via narrower legal types
Reuse the ExpandIntRes_CLMUL Karatsuba identity to expand vector
CLMUL/CLMULR/CLMULH on wider element types (vXi16, vXi32, vXi64) by
decomposing into half-element-width operations that eventually reach a
legal CLMUL type.
Three generic strategies in expandCLMUL:
1. Karatsuba: halve element width (e.g. v8i16 -> v8i8 on AArch64)
2. Element widen: zext to wider type if CLMUL is legal there (e.g. x86)
3. Count widen: pad with undef to double element count (e.g. v4i16 -> v8i16)
A helper canNarrowCLMULToLegal() guides strategy selection and prevents
circular expansion in the CLMULH bitreverse path.
Also add Custom BITREVERSE lowering for v4i16/v8i16 on AArch64 using
REV16+RBIT, which the CLMULH expansion relies on.
Fixes #183768
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 166 +-
.../Target/AArch64/AArch64ISelLowering.cpp | 16 +
llvm/test/CodeGen/AArch64/clmul-fixed.ll | 2533 ++++++-----------
llvm/test/CodeGen/PowerPC/clmul-vector.ll | 433 +--
llvm/test/CodeGen/X86/clmul-vector.ll | 155 +-
5 files changed, 1324 insertions(+), 1979 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index cc719b1e67f53..ce096cf03293c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8456,6 +8456,53 @@ SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal);
}
+/// Check if CLMUL on VT can eventually reach a type with legal CLMUL through
+/// a chain of Karatsuba decompositions (halving element width) and/or vector
+/// widening (doubling element count). This guides expansion strategy selection:
+/// if true, the Karatsuba/widening path produces better code than bit-by-bit.
+///
+/// KaratsubaDepth tracks halving steps only (each creates ~4x more operations).
+/// Widening steps are cheap (O(1) pad/extract) and don't count.
+/// Limiting halvings to 2 prevents exponential blowup:
+/// 1 halving: ~4 sub-CLMULs (good, e.g. v8i16 -> v8i8)
+/// 2 halvings: ~16 sub-CLMULs (acceptable, e.g. v4i32 -> v4i16 -> v8i8)
+/// 3 halvings: ~64 sub-CLMULs (worse than bit-by-bit expansion)
+static bool canNarrowCLMULToLegal(const TargetLowering &TLI, LLVMContext &Ctx,
+ EVT VT, unsigned KaratsubaDepth = 0,
+ unsigned TotalDepth = 0) {
+ if (KaratsubaDepth > 2 || TotalDepth > 8 || !VT.isVector() ||
+ VT.isScalableVector())
+ return false;
+ if (TLI.isOperationLegalOrCustom(ISD::CLMUL, VT))
+ return true;
+ if (!TLI.isTypeLegal(VT))
+ return false;
+
+ unsigned BW = VT.getScalarSizeInBits();
+
+ // Karatsuba: halve element width, same element count.
+ // This is the expensive step — each halving creates ~4x more operations.
+ if (BW >= 16) {
+ EVT HalfEltVT = EVT::getIntegerVT(Ctx, BW / 2);
+ EVT HalfVT = EVT::getVectorVT(Ctx, HalfEltVT, VT.getVectorElementCount());
+ if (TLI.isTypeLegal(HalfVT) &&
+ canNarrowCLMULToLegal(TLI, Ctx, HalfVT, KaratsubaDepth + 1,
+ TotalDepth + 1))
+ return true;
+ }
+
+ // Widen: double element count (fixed-width vectors only).
+ // This is cheap — just INSERT_SUBVECTOR + EXTRACT_SUBVECTOR.
+ if (auto EC = VT.getVectorElementCount(); EC.isFixed()) {
+ EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), EC * 2);
+ if (TLI.isTypeLegal(WideVT) &&
+ canNarrowCLMULToLegal(TLI, Ctx, WideVT, KaratsubaDepth, TotalDepth + 1))
+ return true;
+ }
+
+ return false;
+}
+
SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
SDLoc DL(Node);
EVT VT = Node->getValueType(0);
@@ -8463,19 +8510,104 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
SDValue Y = Node->getOperand(1);
unsigned BW = VT.getScalarSizeInBits();
unsigned Opcode = Node->getOpcode();
-
- // Scalarize if the vector multiplication is unlikely to work.
- if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
- return DAG.UnrollVectorOp(Node);
+ LLVMContext &Ctx = *DAG.getContext();
switch (Opcode) {
case ISD::CLMUL: {
+ // For vector types, try decomposition strategies that leverage legal
+ // CLMUL on narrower or wider element types, avoiding the expensive
+ // bit-by-bit expansion.
+ if (VT.isVector()) {
+ // Strategy 1: Karatsuba decomposition to half-element-width CLMUL.
+ // Applies ExpandIntRes_CLMUL's identity element-wise:
+ // CLMUL(X, Y) = (Hi << HalfBW) | Lo
+ // where:
+ // Lo = CLMUL(XLo, YLo)
+ // Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+ unsigned HalfBW = BW / 2;
+ if (HalfBW >= 8) {
+ EVT HalfEltVT = EVT::getIntegerVT(Ctx, HalfBW);
+ EVT HalfVT =
+ EVT::getVectorVT(Ctx, HalfEltVT, VT.getVectorElementCount());
+ if (isTypeLegal(HalfVT) &&
+ canNarrowCLMULToLegal(*this, Ctx, HalfVT,
+ /*KaratsubaDepth=*/1)) {
+ SDValue ShAmt = DAG.getShiftAmountConstant(HalfBW, VT, DL);
+
+ // Extract low and high halves of each element.
+ SDValue XLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, X);
+ SDValue XHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+ DAG.getNode(ISD::SRL, DL, VT, X, ShAmt));
+ SDValue YLo = DAG.getNode(ISD::TRUNCATE, DL, HalfVT, Y);
+ SDValue YHi = DAG.getNode(ISD::TRUNCATE, DL, HalfVT,
+ DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt));
+
+ // Lo = CLMUL(XLo, YLo)
+ SDValue Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YLo);
+
+ // Hi = CLMULH(XLo, YLo) ^ CLMUL(XLo, YHi) ^ CLMUL(XHi, YLo)
+ SDValue LoH = DAG.getNode(ISD::CLMULH, DL, HalfVT, XLo, YLo);
+ SDValue Cross1 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XLo, YHi);
+ SDValue Cross2 = DAG.getNode(ISD::CLMUL, DL, HalfVT, XHi, YLo);
+ SDValue Cross = DAG.getNode(ISD::XOR, DL, HalfVT, Cross1, Cross2);
+ SDValue Hi = DAG.getNode(ISD::XOR, DL, HalfVT, LoH, Cross);
+
+ // Reassemble: Result = ZExt(Lo) | (ZExt(Hi) << HalfBW)
+ SDValue LoExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Lo);
+ SDValue HiExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Hi);
+ SDValue HiShifted = DAG.getNode(ISD::SHL, DL, VT, HiExt, ShAmt);
+ return DAG.getNode(ISD::OR, DL, VT, LoExt, HiShifted);
+ }
+ }
+
+ // Strategy 2: Widen to double-element-width CLMUL.
+ // CLMUL(X, Y) = Trunc(CLMUL(ZExt(X), ZExt(Y)))
+ {
+ EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+ if (isTypeLegal(ExtVT) && isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
+ isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT)) {
+ // If CLMUL on ExtVT is Custom (not Legal), the target may
+ // scalarize it, costing O(NumElements) scalar ops. The bit-by-bit
+ // fallback costs O(BW) vectorized iterations. Only widen when
+ // element count is small enough that scalarization is cheaper.
+ unsigned NumElts = VT.getVectorMinNumElements();
+ if (isOperationLegal(ISD::CLMUL, ExtVT) || NumElts < BW) {
+ SDValue XExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, X);
+ SDValue YExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Y);
+ SDValue Mul = DAG.getNode(ISD::CLMUL, DL, ExtVT, XExt, YExt);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Mul);
+ }
+ }
+ }
+
+ // Strategy 3: Widen element count (pad with undef, do CLMUL on wider
+ // vector, extract lower result). CLMUL is element-wise, so upper
+ // (undef) lanes don't affect the lower results.
+    // e.g. v4i16 -> pad to v8i16 -> Karatsuba to v8i8 PMUL -> extract v4i16.
+ if (auto EC = VT.getVectorElementCount(); EC.isFixed()) {
+ EVT WideVT = EVT::getVectorVT(Ctx, VT.getVectorElementType(), EC * 2);
+ if (isTypeLegal(WideVT) && canNarrowCLMULToLegal(*this, Ctx, WideVT)) {
+ SDValue Undef = DAG.getUNDEF(WideVT);
+ SDValue XWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+ X, DAG.getVectorIdxConstant(0, DL));
+ SDValue YWide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, Undef,
+ Y, DAG.getVectorIdxConstant(0, DL));
+ SDValue WideRes = DAG.getNode(ISD::CLMUL, DL, WideVT, XWide, YWide);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WideRes,
+ DAG.getVectorIdxConstant(0, DL));
+ }
+ }
+ }
+
+ // Scalarize if the vector multiplication is unlikely to work.
+ if (VT.isVector() && !isOperationLegalOrCustom(ISD::MUL, VT))
+ return DAG.UnrollVectorOp(Node);
+
// NOTE: If you change this expansion, please update the cost model
// calculation in BasicTTIImpl::getTypeBasedIntrinsicInstrCost for
// Intrinsic::clmul.
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), Ctx, VT);
SDValue Res = DAG.getConstant(0, DL, VT);
for (unsigned I = 0; I < BW; ++I) {
@@ -8488,8 +8620,7 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
// instructions.
SDValue Part;
if (!hasBitTest(Y, ShiftAmt) &&
- isOperationLegalOrCustom(
- ISD::MUL, getTypeToTransformTo(*DAG.getContext(), VT))) {
+ isOperationLegalOrCustom(ISD::MUL, getTypeToTransformTo(Ctx, VT))) {
Part = DAG.getNode(ISD::MUL, DL, VT, X, YMasked);
} else {
// Canonical bit test: (Y & (1 << I)) != 0
@@ -8516,17 +8647,20 @@ SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
}
[[fallthrough]];
case ISD::CLMULH: {
- EVT ExtVT = VT.changeElementType(
- *DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), 2 * BW));
- // For example, ExtVT = i64 based operations aren't legal on a 32-bit
- // target; use bitreverse-based lowering in this case.
- // Also prefer bitreverse-based lowering when CLMUL is legal on VT but
- // not on ExtVT, to avoid expanding CLMUL on the wider type (e.g. v8i8
- // on AArch64 where CLMUL v8i8 is legal via PMUL but CLMUL v8i16 is not).
+ EVT ExtVT = VT.changeElementType(Ctx, EVT::getIntegerVT(Ctx, 2 * BW));
+ // Use bitreverse-based lowering (CLMULR/H = rev(CLMUL(rev,rev)) >> S)
+ // when any of these hold:
+ // (a) ZERO_EXTEND to ExtVT or SRL on ExtVT isn't legal.
+ // (b) CLMUL is legal on VT but not on ExtVT (e.g. v8i8 on AArch64).
+ // (c) CLMUL on VT can be efficiently expanded via Karatsuba/widening
+ // to reach legal CLMUL. The bitreverse path creates CLMUL(VT) which
+ // will be expanded efficiently. The widening path would create
+    //     CLMUL(ExtVT) -> Karatsuba -> CLMULH(VT), causing a cycle.
if (!isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT) ||
!isOperationLegalOrCustom(ISD::SRL, ExtVT) ||
(!isOperationLegalOrCustom(ISD::CLMUL, ExtVT) &&
- isOperationLegalOrCustom(ISD::CLMUL, VT))) {
+ isOperationLegalOrCustom(ISD::CLMUL, VT)) ||
+ canNarrowCLMULToLegal(*this, Ctx, VT)) {
SDValue XRev = DAG.getNode(ISD::BITREVERSE, DL, VT, X);
SDValue YRev = DAG.getNode(ISD::BITREVERSE, DL, VT, Y);
SDValue ClMul = DAG.getNode(ISD::CLMUL, DL, VT, XRev, YRev);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2cd78493d2c23..b7d186d83c92e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1329,6 +1329,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTLS, VT, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v8i8, Legal);
setOperationAction(ISD::BITREVERSE, MVT::v16i8, Legal);
+ setOperationAction(ISD::BITREVERSE, MVT::v4i16, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v8i16, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v2i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v4i32, Custom);
setOperationAction(ISD::BITREVERSE, MVT::v1i64, Custom);
@@ -11960,6 +11962,20 @@ SDValue AArch64TargetLowering::LowerBitreverse(SDValue Op,
default:
llvm_unreachable("Invalid type for bitreverse!");
+ case MVT::v4i16: {
+ VST = MVT::v8i8;
+ REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+
+ case MVT::v8i16: {
+ VST = MVT::v16i8;
+ REVB = DAG.getNode(AArch64ISD::REV16, DL, VST, Op.getOperand(0));
+
+ break;
+ }
+
case MVT::v2i32: {
VST = MVT::v8i8;
REVB = DAG.getNode(AArch64ISD::REV32, DL, VST, Op.getOperand(0));
diff --git a/llvm/test/CodeGen/AArch64/clmul-fixed.ll b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
index 23692dc456fc2..46ad7d9bbc295 100644
--- a/llvm/test/CodeGen/AArch64/clmul-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/clmul-fixed.ll
@@ -23,69 +23,23 @@ define <8 x i8> @clmul_v8i8_neon(<8 x i8> %x, <8 x i8> %y) {
define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
; CHECK-LABEL: clmul_v8i16_neon:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.8h, #2
-; CHECK-NEXT: movi v3.8h, #1
-; CHECK-NEXT: movi v4.8h, #4
-; CHECK-NEXT: movi v5.8h, #8
-; CHECK-NEXT: movi v6.8h, #16
-; CHECK-NEXT: movi v7.8h, #32
-; CHECK-NEXT: movi v16.8h, #128
-; CHECK-NEXT: movi v17.8h, #1, lsl #8
-; CHECK-NEXT: movi v18.8h, #8, lsl #8
-; CHECK-NEXT: movi v19.8h, #16, lsl #8
-; CHECK-NEXT: movi v20.8h, #64
-; CHECK-NEXT: movi v21.8h, #2, lsl #8
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: movi v22.8h, #32, lsl #8
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: mul v2.8h, v0.8h, v2.8h
-; CHECK-NEXT: mul v3.8h, v0.8h, v3.8h
-; CHECK-NEXT: mul v4.8h, v0.8h, v4.8h
-; CHECK-NEXT: mul v5.8h, v0.8h, v5.8h
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: movi v23.8h, #4, lsl #8
-; CHECK-NEXT: movi v24.8h, #64, lsl #8
-; CHECK-NEXT: mul v6.8h, v0.8h, v6.8h
-; CHECK-NEXT: mul v7.8h, v0.8h, v7.8h
-; CHECK-NEXT: mul v16.8h, v0.8h, v16.8h
-; CHECK-NEXT: mul v17.8h, v0.8h, v17.8h
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: mul v18.8h, v0.8h, v18.8h
-; CHECK-NEXT: mul v19.8h, v0.8h, v19.8h
-; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: mul v4.8h, v0.8h, v20.8h
-; CHECK-NEXT: movi v20.8h, #128, lsl #8
-; CHECK-NEXT: mul v5.8h, v0.8h, v21.8h
-; CHECK-NEXT: and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT: and v23.16b, v1.16b, v24.16b
-; CHECK-NEXT: mul v22.8h, v0.8h, v22.8h
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v20.16b
-; CHECK-NEXT: mul v3.8h, v0.8h, v21.8h
-; CHECK-NEXT: mul v17.8h, v0.8h, v23.8h
-; CHECK-NEXT: eor v4.16b, v6.16b, v4.16b
-; CHECK-NEXT: eor v5.16b, v7.16b, v5.16b
-; CHECK-NEXT: eor v6.16b, v16.16b, v22.16b
-; CHECK-NEXT: mul v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: eor v1.16b, v2.16b, v4.16b
-; CHECK-NEXT: eor v2.16b, v5.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v6.16b, v17.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: ret
%a = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %x, <8 x i16> %y)
ret <8 x i16> %a
@@ -94,69 +48,26 @@ define <8 x i16> @clmul_v8i16_neon(<8 x i16> %x, <8 x i16> %y) {
define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: clmul_v4i16_neon:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4h, #2
-; CHECK-NEXT: movi v3.4h, #1
-; CHECK-NEXT: movi v4.4h, #4
-; CHECK-NEXT: movi v5.4h, #8
-; CHECK-NEXT: movi v6.4h, #16
-; CHECK-NEXT: movi v7.4h, #32
-; CHECK-NEXT: movi v16.4h, #128
-; CHECK-NEXT: movi v17.4h, #1, lsl #8
-; CHECK-NEXT: movi v18.4h, #8, lsl #8
-; CHECK-NEXT: movi v19.4h, #16, lsl #8
-; CHECK-NEXT: movi v20.4h, #64
-; CHECK-NEXT: movi v21.4h, #2, lsl #8
-; CHECK-NEXT: and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT: and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT: movi v22.4h, #32, lsl #8
-; CHECK-NEXT: and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT: and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT: and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT: and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT: and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT: and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT: mul v2.4h, v0.4h, v2.4h
-; CHECK-NEXT: mul v3.4h, v0.4h, v3.4h
-; CHECK-NEXT: mul v4.4h, v0.4h, v4.4h
-; CHECK-NEXT: mul v5.4h, v0.4h, v5.4h
-; CHECK-NEXT: and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT: movi v23.4h, #4, lsl #8
-; CHECK-NEXT: movi v24.4h, #64, lsl #8
-; CHECK-NEXT: mul v6.4h, v0.4h, v6.4h
-; CHECK-NEXT: mul v7.4h, v0.4h, v7.4h
-; CHECK-NEXT: mul v16.4h, v0.4h, v16.4h
-; CHECK-NEXT: mul v17.4h, v0.4h, v17.4h
-; CHECK-NEXT: and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT: mul v18.4h, v0.4h, v18.4h
-; CHECK-NEXT: mul v19.4h, v0.4h, v19.4h
-; CHECK-NEXT: and v22.8b, v1.8b, v22.8b
-; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT: eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT: mul v4.4h, v0.4h, v20.4h
-; CHECK-NEXT: movi v20.4h, #128, lsl #8
-; CHECK-NEXT: mul v5.4h, v0.4h, v21.4h
-; CHECK-NEXT: and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT: and v23.8b, v1.8b, v24.8b
-; CHECK-NEXT: mul v22.4h, v0.4h, v22.4h
-; CHECK-NEXT: eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT: eor v7.8b, v16.8b, v17.8b
-; CHECK-NEXT: eor v16.8b, v18.8b, v19.8b
-; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v20.8b
-; CHECK-NEXT: mul v3.4h, v0.4h, v21.4h
-; CHECK-NEXT: mul v17.4h, v0.4h, v23.4h
-; CHECK-NEXT: eor v4.8b, v6.8b, v4.8b
-; CHECK-NEXT: eor v5.8b, v7.8b, v5.8b
-; CHECK-NEXT: eor v6.8b, v16.8b, v22.8b
-; CHECK-NEXT: mul v0.4h, v0.4h, v1.4h
-; CHECK-NEXT: eor v1.8b, v2.8b, v4.8b
-; CHECK-NEXT: eor v2.8b, v5.8b, v3.8b
-; CHECK-NEXT: eor v3.8b, v6.8b, v17.8b
-; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret
%a = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %x, <4 x i16> %y)
ret <4 x i16> %a
@@ -165,269 +76,184 @@ define <4 x i16> @clmul_v4i16_neon(<4 x i16> %x, <4 x i16> %y) {
define <4 x i32> @clmul_v4i32_neon(<4 x i32> %x, <4 x i32> %y) {
; CHECK-LABEL: clmul_v4i32_neon:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4s, #2
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: movi v4.4s, #4
-; CHECK-NEXT: movi v5.4s, #8
-; CHECK-NEXT: movi v6.4s, #16
-; CHECK-NEXT: movi v7.4s, #32
-; CHECK-NEXT: movi v16.4s, #64
-; CHECK-NEXT: movi v17.4s, #128
-; CHECK-NEXT: movi v18.4s, #1, lsl #8
-; CHECK-NEXT: movi v19.4s, #2, lsl #8
-; CHECK-NEXT: movi v20.4s, #8, lsl #8
-; CHECK-NEXT: movi v21.4s, #128, lsl #16
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: mul v2.4s, v0.4s, v2.4s
-; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT: mul v6.4s, v0.4s, v6.4s
-; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: movi v22.4s, #8, lsl #16
-; CHECK-NEXT: movi v23.4s, #2, lsl #24
-; CHECK-NEXT: movi v25.4s, #4, lsl #24
-; CHECK-NEXT: movi v24.4s, #32, lsl #16
-; CHECK-NEXT: movi v26.4s, #8, lsl #24
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: movi v4.4s, #16, lsl #8
-; CHECK-NEXT: mul v5.4s, v0.4s, v16.4s
-; CHECK-NEXT: mul v16.4s, v0.4s, v17.4s
-; CHECK-NEXT: mul v17.4s, v0.4s, v18.4s
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT: movi v19.4s, #32, lsl #8
-; CHECK-NEXT: and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: movi v20.4s, #64, lsl #8
-; CHECK-NEXT: mul v21.4s, v0.4s, v21.4s
-; CHECK-NEXT: and v3.16b, v1.16b, v4.16b
-; CHECK-NEXT: eor v5.16b, v6.16b, v5.16b
-; CHECK-NEXT: movi v4.4s, #1, lsl #16
-; CHECK-NEXT: eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT: movi v16.4s, #2, lsl #16
-; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT: mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: movi v17.4s, #4, lsl #8
-; CHECK-NEXT: mul v3.4s, v0.4s, v3.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v16.16b
-; CHECK-NEXT: movi v16.4s, #64, lsl #16
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: mul v7.4s, v0.4s, v19.4s
-; CHECK-NEXT: movi v19.4s, #4, lsl #16
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: eor v3.16b, v18.16b, v3.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v20.16b
-; CHECK-NEXT: movi v20.4s, #1, lsl #24
-; CHECK-NEXT: mul v4.4s, v0.4s, v4.4s
-; CHECK-NEXT: mul v5.4s, v0.4s, v5.4s
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v19.16b
-; CHECK-NEXT: mul v18.4s, v0.4s, v18.4s
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: movi v19.4s, #128, lsl #8
-; CHECK-NEXT: mul v16.4s, v0.4s, v16.4s
-; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT: mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #16
-; CHECK-NEXT: mul v20.4s, v0.4s, v20.4s
-; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v18.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v19.16b
-; CHECK-NEXT: mul v18.4s, v0.4s, v23.4s
-; CHECK-NEXT: and v19.16b, v1.16b, v25.16b
-; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT: and v21.16b, v1.16b, v24.16b
-; CHECK-NEXT: movi v23.4s, #32, lsl #24
-; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT: mul v5.4s, v0.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v1.16b, v22.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #24
-; CHECK-NEXT: movi v24.4s, #64, lsl #24
-; CHECK-NEXT: mul v17.4s, v0.4s, v17.4s
-; CHECK-NEXT: eor v16.16b, v16.16b, v20.16b
-; CHECK-NEXT: and v20.16b, v1.16b, v26.16b
-; CHECK-NEXT: mul v19.4s, v0.4s, v19.4s
-; CHECK-NEXT: mul v7.4s, v0.4s, v7.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b
-; CHECK-NEXT: mul v6.4s, v0.4s, v21.4s
-; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v21.16b, v1.16b, v23.16b
-; CHECK-NEXT: eor v5.16b, v16.16b, v18.16b
-; CHECK-NEXT: movi v16.4s, #128, lsl #24
-; CHECK-NEXT: mul v18.4s, v0.4s, v20.4s
-; CHECK-NEXT: and v20.16b, v1.16b, v22.16b
-; CHECK-NEXT: and v22.16b, v1.16b, v24.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v17.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT: eor v5.16b, v5.16b, v19.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v16.16b
-; CHECK-NEXT: mul v7.4s, v0.4s, v20.4s
-; CHECK-NEXT: mul v16.4s, v0.4s, v21.4s
-; CHECK-NEXT: mul v17.4s, v0.4s, v22.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v6.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v18.16b
-; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: xtn v2.4h, v1.4s
+; CHECK-NEXT: xtn v3.4h, v0.4s
+; CHECK-NEXT: shrn v16.4h, v0.4s, #16
+; CHECK-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEXT: rev16 v4.8b, v2.8b
+; CHECK-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEXT: xtn v1.8b, v3.8h
+; CHECK-NEXT: shrn v2.8b, v2.8h, #8
+; CHECK-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEXT: rbit v22.8b, v0.8b
+; CHECK-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEXT: rbit v24.8b, v1.8b
+; CHECK-NEXT: pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEXT: pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEXT: pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEXT: pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT: pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT: pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT: pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEXT: pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT: shll v1.8h, v1.8b, #8
+; CHECK-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %x, <4 x i32> %y)
ret <4 x i32> %a
}
define <2 x i32> @clmul_v2i32_neon(<2 x i32> %x, <2 x i32> %y) {
-; CHECK-LABEL: clmul_v2i32_neon:
-; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.2s, #2
-; CHECK-NEXT: movi v3.2s, #1
-; CHECK-NEXT: movi v4.2s, #4
-; CHECK-NEXT: movi v5.2s, #8
-; CHECK-NEXT: movi v6.2s, #16
-; CHECK-NEXT: movi v7.2s, #32
-; CHECK-NEXT: movi v16.2s, #64
-; CHECK-NEXT: movi v17.2s, #128
-; CHECK-NEXT: movi v18.2s, #1, lsl #8
-; CHECK-NEXT: movi v19.2s, #2, lsl #8
-; CHECK-NEXT: movi v20.2s, #8, lsl #8
-; CHECK-NEXT: movi v21.2s, #128, lsl #16
-; CHECK-NEXT: and v2.8b, v1.8b, v2.8b
-; CHECK-NEXT: and v3.8b, v1.8b, v3.8b
-; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT: and v5.8b, v1.8b, v5.8b
-; CHECK-NEXT: and v6.8b, v1.8b, v6.8b
-; CHECK-NEXT: and v7.8b, v1.8b, v7.8b
-; CHECK-NEXT: and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT: and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT: and v18.8b, v1.8b, v18.8b
-; CHECK-NEXT: mul v2.2s, v0.2s, v2.2s
-; CHECK-NEXT: mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT: mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT: mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT: mul v6.2s, v0.2s, v6.2s
-; CHECK-NEXT: mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT: and v21.8b, v1.8b, v21.8b
-; CHECK-NEXT: movi v22.2s, #8, lsl #16
-; CHECK-NEXT: movi v23.2s, #2, lsl #24
-; CHECK-NEXT: movi v25.2s, #4, lsl #24
-; CHECK-NEXT: movi v24.2s, #32, lsl #16
-; CHECK-NEXT: movi v26.2s, #8, lsl #24
-; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
-; CHECK-NEXT: eor v3.8b, v4.8b, v5.8b
-; CHECK-NEXT: movi v4.2s, #16, lsl #8
-; CHECK-NEXT: mul v5.2s, v0.2s, v16.2s
-; CHECK-NEXT: mul v16.2s, v0.2s, v17.2s
-; CHECK-NEXT: mul v17.2s, v0.2s, v18.2s
-; CHECK-NEXT: eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT: and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT: movi v19.2s, #32, lsl #8
-; CHECK-NEXT: and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT: movi v20.2s, #64, lsl #8
-; CHECK-NEXT: mul v21.2s, v0.2s, v21.2s
-; CHECK-NEXT: and v3.8b, v1.8b, v4.8b
-; CHECK-NEXT: eor v5.8b, v6.8b, v5.8b
-; CHECK-NEXT: movi v4.2s, #1, lsl #16
-; CHECK-NEXT: eor v6.8b, v16.8b, v17.8b
-; CHECK-NEXT: movi v16.2s, #2, lsl #16
-; CHECK-NEXT: mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT: mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT: and v19.8b, v1.8b, v19.8b
-; CHECK-NEXT: movi v17.2s, #4, lsl #8
-; CHECK-NEXT: mul v3.2s, v0.2s, v3.2s
-; CHECK-NEXT: eor v2.8b, v2.8b, v5.8b
-; CHECK-NEXT: and v23.8b, v1.8b, v23.8b
-; CHECK-NEXT: and v4.8b, v1.8b, v4.8b
-; CHECK-NEXT: and v5.8b, v1.8b, v16.8b
-; CHECK-NEXT: movi v16.2s, #64, lsl #16
-; CHECK-NEXT: eor v6.8b, v6.8b, v7.8b
-; CHECK-NEXT: mul v7.2s, v0.2s, v19.2s
-; CHECK-NEXT: movi v19.2s, #4, lsl #16
-; CHECK-NEXT: and v17.8b, v1.8b, v17.8b
-; CHECK-NEXT: eor v3.8b, v18.8b, v3.8b
-; CHECK-NEXT: and v18.8b, v1.8b, v20.8b
-; CHECK-NEXT: movi v20.2s, #1, lsl #24
-; CHECK-NEXT: mul v4.2s, v0.2s, v4.2s
-; CHECK-NEXT: mul v5.2s, v0.2s, v5.2s
-; CHECK-NEXT: and v16.8b, v1.8b, v16.8b
-; CHECK-NEXT: mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT: eor v3.8b, v3.8b, v7.8b
-; CHECK-NEXT: and v7.8b, v1.8b, v19.8b
-; CHECK-NEXT: mul v18.2s, v0.2s, v18.2s
-; CHECK-NEXT: and v20.8b, v1.8b, v20.8b
-; CHECK-NEXT: movi v19.2s, #128, lsl #8
-; CHECK-NEXT: mul v16.2s, v0.2s, v16.2s
-; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT: mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT: and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT: movi v22.2s, #16, lsl #16
-; CHECK-NEXT: mul v20.2s, v0.2s, v20.2s
-; CHECK-NEXT: eor v6.8b, v6.8b, v17.8b
-; CHECK-NEXT: eor v3.8b, v3.8b, v18.8b
-; CHECK-NEXT: and v17.8b, v1.8b, v19.8b
-; CHECK-NEXT: mul v18.2s, v0.2s, v23.2s
-; CHECK-NEXT: and v19.8b, v1.8b, v25.8b
-; CHECK-NEXT: eor v16.8b, v16.8b, v21.8b
-; CHECK-NEXT: and v21.8b, v1.8b, v24.8b
-; CHECK-NEXT: movi v23.2s, #32, lsl #24
-; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT: mul v5.2s, v0.2s, v7.2s
-; CHECK-NEXT: and v7.8b, v1.8b, v22.8b
-; CHECK-NEXT: movi v22.2s, #16, lsl #24
-; CHECK-NEXT: movi v24.2s, #64, lsl #24
-; CHECK-NEXT: mul v17.2s, v0.2s, v17.2s
-; CHECK-NEXT: eor v16.8b, v16.8b, v20.8b
-; CHECK-NEXT: and v20.8b, v1.8b, v26.8b
-; CHECK-NEXT: mul v19.2s, v0.2s, v19.2s
-; CHECK-NEXT: mul v7.2s, v0.2s, v7.2s
-; CHECK-NEXT: eor v2.8b, v2.8b, v6.8b
-; CHECK-NEXT: mul v6.2s, v0.2s, v21.2s
-; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
-; CHECK-NEXT: and v21.8b, v1.8b, v23.8b
-; CHECK-NEXT: eor v5.8b, v16.8b, v18.8b
-; CHECK-NEXT: movi v16.2s, #128, lsl #24
-; CHECK-NEXT: mul v18.2s, v0.2s, v20.2s
-; CHECK-NEXT: and v20.8b, v1.8b, v22.8b
-; CHECK-NEXT: and v22.8b, v1.8b, v24.8b
-; CHECK-NEXT: eor v3.8b, v3.8b, v17.8b
-; CHECK-NEXT: eor v4.8b, v4.8b, v7.8b
-; CHECK-NEXT: eor v5.8b, v5.8b, v19.8b
-; CHECK-NEXT: and v1.8b, v1.8b, v16.8b
-; CHECK-NEXT: mul v7.2s, v0.2s, v20.2s
-; CHECK-NEXT: mul v16.2s, v0.2s, v21.2s
-; CHECK-NEXT: mul v17.2s, v0.2s, v22.2s
-; CHECK-NEXT: eor v2.8b, v2.8b, v3.8b
-; CHECK-NEXT: eor v3.8b, v4.8b, v6.8b
-; CHECK-NEXT: eor v4.8b, v5.8b, v18.8b
-; CHECK-NEXT: mul v0.2s, v0.2s, v1.2s
-; CHECK-NEXT: eor v1.8b, v2.8b, v3.8b
-; CHECK-NEXT: eor v2.8b, v4.8b, v7.8b
-; CHECK-NEXT: eor v3.8b, v16.8b, v17.8b
-; CHECK-NEXT: eor v1.8b, v1.8b, v2.8b
-; CHECK-NEXT: eor v0.8b, v3.8b, v0.8b
-; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
-; CHECK-NEXT: ret
+; CHECK-NEON-LABEL: clmul_v2i32_neon:
+; CHECK-NEON: // %bb.0:
+; CHECK-NEON-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEON-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEON-NEXT: xtn v2.4h, v1.4s
+; CHECK-NEON-NEXT: xtn v3.4h, v0.4s
+; CHECK-NEON-NEXT: shrn v16.4h, v0.4s, #16
+; CHECK-NEON-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT: rev16 v4.8b, v2.8b
+; CHECK-NEON-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT: xtn v0.8b, v2.8h
+; CHECK-NEON-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT: xtn v1.8b, v3.8h
+; CHECK-NEON-NEXT: shrn v2.8b, v2.8h, #8
+; CHECK-NEON-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT: rbit v22.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT: rbit v24.8b, v1.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v16.8b, v0.8b
+; CHECK-NEON-NEXT: pmul v25.8b, v20.8b, v2.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v1.8b, v17.8b
+; CHECK-NEON-NEXT: pmul v2.8b, v1.8b, v2.8b
+; CHECK-NEON-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v3.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT: eor v2.8b, v2.8b, v3.8b
+; CHECK-NEON-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v1.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEON-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEON-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: eor v1.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: shll v1.8h, v1.8b, #8
+; CHECK-NEON-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-NEON-NEXT: ret
+;
+; CHECK-AES-LABEL: clmul_v2i32_neon:
+; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: ushll v1.2d, v1.2s, #0
+; CHECK-AES-NEXT: ushll v0.2d, v0.2s, #0
+; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
+; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
+; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-AES-NEXT: xtn v0.2s, v0.2d
+; CHECK-AES-NEXT: ret
%a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %x, <2 x i32> %y)
ret <2 x i32> %a
}
@@ -1730,45 +1556,15 @@ define <1 x i128> @clmul_v1i128_neon(<1 x i128> %x, <1 x i128> %y) {
define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
; CHECK-LABEL: clmul_v8i16_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mov v2.16b, v1.16b
-; CHECK-NEXT: mov v3.16b, v1.16b
-; CHECK-NEXT: mov v4.16b, v1.16b
-; CHECK-NEXT: mov v5.16b, v1.16b
-; CHECK-NEXT: mov v6.16b, v1.16b
-; CHECK-NEXT: mov v7.16b, v1.16b
-; CHECK-NEXT: mov v16.16b, v1.16b
-; CHECK-NEXT: bic v1.8h, #127
-; CHECK-NEXT: bic v2.8h, #253
-; CHECK-NEXT: bic v3.8h, #254
-; CHECK-NEXT: bic v4.8h, #251
-; CHECK-NEXT: bic v5.8h, #247
-; CHECK-NEXT: bic v6.8h, #239
-; CHECK-NEXT: bic v7.8h, #223
-; CHECK-NEXT: bic v16.8h, #191
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: xtn v2.8b, v2.8h
-; CHECK-NEXT: xtn v3.8b, v3.8h
-; CHECK-NEXT: xtn v4.8b, v4.8h
-; CHECK-NEXT: xtn v5.8b, v5.8h
-; CHECK-NEXT: xtn v6.8b, v6.8h
-; CHECK-NEXT: xtn v7.8b, v7.8h
-; CHECK-NEXT: xtn v16.8b, v16.8h
-; CHECK-NEXT: umull v2.8h, v0.8b, v2.8b
-; CHECK-NEXT: umull v3.8h, v0.8b, v3.8b
-; CHECK-NEXT: umull v4.8h, v0.8b, v4.8b
-; CHECK-NEXT: umull v5.8h, v0.8b, v5.8b
-; CHECK-NEXT: umull v6.8h, v0.8b, v6.8b
-; CHECK-NEXT: umull v7.8h, v0.8b, v7.8b
-; CHECK-NEXT: umull v16.8h, v0.8b, v16.8b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: eor v4.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v16.16b
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rbit v2.8b, v1.8b
+; CHECK-NEXT: rbit v3.8b, v0.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: rbit v2.8b, v2.8b
+; CHECK-NEXT: ushr v1.8b, v2.8b, #1
+; CHECK-NEXT: shll v1.8h, v1.8b, #8
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%zextx = zext <8 x i8> %x to <8 x i16>
%zexty = zext <8 x i8> %y to <8 x i16>
@@ -1779,84 +1575,26 @@ define <8 x i16> @clmul_v8i16_neon_zext(<8 x i8> %x, <8 x i8> %y) {
define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: clmul_v16i16_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: ushll2 v2.8h, v1.16b, #0
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: rbit v4.8b, v1.8b
+; CHECK-NEXT: rbit v5.8b, v0.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v1.8b
+; CHECK-NEXT: rbit v6.8b, v3.8b
+; CHECK-NEXT: rbit v7.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v2.8b, v3.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: pmul v5.8b, v7.8b, v6.8b
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
-; CHECK-NEXT: mov v4.16b, v2.16b
-; CHECK-NEXT: mov v5.16b, v2.16b
-; CHECK-NEXT: mov v6.16b, v2.16b
-; CHECK-NEXT: mov v7.16b, v2.16b
-; CHECK-NEXT: mov v16.16b, v2.16b
-; CHECK-NEXT: mov v17.16b, v2.16b
-; CHECK-NEXT: mov v18.16b, v1.16b
-; CHECK-NEXT: mov v19.16b, v1.16b
-; CHECK-NEXT: mov v20.16b, v1.16b
-; CHECK-NEXT: mov v21.16b, v1.16b
-; CHECK-NEXT: mov v22.16b, v1.16b
-; CHECK-NEXT: mov v23.16b, v1.16b
-; CHECK-NEXT: bic v4.8h, #253
-; CHECK-NEXT: bic v5.8h, #254
-; CHECK-NEXT: bic v6.8h, #251
-; CHECK-NEXT: bic v7.8h, #247
-; CHECK-NEXT: mov v3.16b, v2.16b
-; CHECK-NEXT: bic v16.8h, #239
-; CHECK-NEXT: bic v17.8h, #223
-; CHECK-NEXT: bic v18.8h, #253
-; CHECK-NEXT: bic v19.8h, #254
-; CHECK-NEXT: bic v20.8h, #251
-; CHECK-NEXT: bic v21.8h, #247
-; CHECK-NEXT: bic v22.8h, #239
-; CHECK-NEXT: bic v23.8h, #223
-; CHECK-NEXT: mov v24.16b, v1.16b
-; CHECK-NEXT: uzp1 v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: uzp1 v5.16b, v0.16b, v5.16b
-; CHECK-NEXT: uzp1 v6.16b, v0.16b, v6.16b
-; CHECK-NEXT: uzp1 v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: bic v3.8h, #191
-; CHECK-NEXT: uzp1 v16.16b, v0.16b, v16.16b
-; CHECK-NEXT: uzp1 v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: xtn v18.8b, v18.8h
-; CHECK-NEXT: xtn v19.8b, v19.8h
-; CHECK-NEXT: xtn v20.8b, v20.8h
-; CHECK-NEXT: xtn v21.8b, v21.8h
-; CHECK-NEXT: xtn v22.8b, v22.8h
-; CHECK-NEXT: xtn v23.8b, v23.8h
-; CHECK-NEXT: bic v24.8h, #191
-; CHECK-NEXT: umull2 v4.8h, v0.16b, v4.16b
-; CHECK-NEXT: umull2 v5.8h, v0.16b, v5.16b
-; CHECK-NEXT: umull2 v6.8h, v0.16b, v6.16b
-; CHECK-NEXT: umull2 v7.8h, v0.16b, v7.16b
-; CHECK-NEXT: uzp1 v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: umull2 v16.8h, v0.16b, v16.16b
-; CHECK-NEXT: umull2 v17.8h, v0.16b, v17.16b
-; CHECK-NEXT: umull v18.8h, v0.8b, v18.8b
-; CHECK-NEXT: xtn v24.8b, v24.8h
-; CHECK-NEXT: umull v19.8h, v0.8b, v19.8b
-; CHECK-NEXT: umull v20.8h, v0.8b, v20.8b
-; CHECK-NEXT: umull v21.8h, v0.8b, v21.8b
-; CHECK-NEXT: umull v22.8h, v0.8b, v22.8b
-; CHECK-NEXT: umull v23.8h, v0.8b, v23.8b
-; CHECK-NEXT: bic v2.8h, #127
-; CHECK-NEXT: bic v1.8h, #127
-; CHECK-NEXT: eor v4.16b, v5.16b, v4.16b
-; CHECK-NEXT: eor v5.16b, v6.16b, v7.16b
-; CHECK-NEXT: umull2 v3.8h, v0.16b, v3.16b
-; CHECK-NEXT: eor v6.16b, v16.16b, v17.16b
-; CHECK-NEXT: umull v7.8h, v0.8b, v24.8b
-; CHECK-NEXT: eor v16.16b, v19.16b, v18.16b
-; CHECK-NEXT: eor v17.16b, v20.16b, v21.16b
-; CHECK-NEXT: eor v18.16b, v22.16b, v23.16b
-; CHECK-NEXT: uzp1 v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: xtn v1.8b, v1.8h
-; CHECK-NEXT: eor v4.16b, v4.16b, v5.16b
-; CHECK-NEXT: eor v3.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v5.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v6.16b, v18.16b, v7.16b
-; CHECK-NEXT: umull2 v2.8h, v0.16b, v2.16b
-; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b
-; CHECK-NEXT: eor v1.16b, v4.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v5.16b, v6.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEXT: ushr v2.8b, v4.8b, #1
+; CHECK-NEXT: ushr v3.8b, v5.8b, #1
+; CHECK-NEXT: shll v2.8h, v2.8b, #8
+; CHECK-NEXT: shll v3.8h, v3.8b, #8
+; CHECK-NEXT: orr v0.16b, v0.16b, v2.16b
+; CHECK-NEXT: orr v1.16b, v1.16b, v3.16b
; CHECK-NEXT: ret
%zextx = zext <16 x i8> %x to <16 x i16>
%zexty = zext <16 x i8> %y to <16 x i16>
@@ -1867,86 +1605,74 @@ define <16 x i16> @clmul_v16i16_neon_zext(<16 x i8> %x, <16 x i8> %y) {
define <4 x i32> @clmul_v4i32_neon_zext(<4 x i16> %x, <4 x i16> %y) {
; CHECK-LABEL: clmul_v4i32_neon_zext:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4s, #2
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: movi v4.4s, #4
-; CHECK-NEXT: movi v5.4s, #8
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: movi v6.4s, #16
-; CHECK-NEXT: movi v7.4s, #32
-; CHECK-NEXT: movi v16.4s, #128
-; CHECK-NEXT: movi v17.4s, #1, lsl #8
-; CHECK-NEXT: movi v18.4s, #8, lsl #8
-; CHECK-NEXT: movi v19.4s, #16, lsl #8
-; CHECK-NEXT: movi v20.4s, #64
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: movi v21.4s, #2, lsl #8
-; CHECK-NEXT: movi v22.4s, #32, lsl #8
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: xtn v4.4h, v4.4s
-; CHECK-NEXT: xtn v5.4h, v5.4s
-; CHECK-NEXT: movi v23.4s, #4, lsl #8
-; CHECK-NEXT: movi v24.4s, #64, lsl #8
-; CHECK-NEXT: xtn v6.4h, v6.4s
-; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: xtn v16.4h, v16.4s
-; CHECK-NEXT: xtn v17.4h, v17.4s
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: xtn v18.4h, v18.4s
-; CHECK-NEXT: xtn v19.4h, v19.4s
-; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT: movi v25.4s, #128, lsl #8
-; CHECK-NEXT: xtn v20.4h, v20.4s
-; CHECK-NEXT: xtn v21.4h, v21.4s
-; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT: xtn v22.4h, v22.4s
-; CHECK-NEXT: and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT: umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT: umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT: umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT: umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT: umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT: umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT: xtn v4.4h, v23.4s
-; CHECK-NEXT: xtn v5.4h, v24.4s
-; CHECK-NEXT: umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT: umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT: umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT: eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT: eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT: eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT: rev16 v3.8b, v1.8b
+; CHECK-NEXT: rev16 v4.8b, v0.8b
+; CHECK-NEXT: movi v2.2d, #0000000000000000
+; CHECK-NEXT: xtn v17.8b, v1.8h
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v3.8b, v3.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: xtn v18.8b, v2.8h
+; CHECK-NEXT: rbit v19.8b, v17.8b
+; CHECK-NEXT: shrn v2.8b, v2.8h, #8
+; CHECK-NEXT: xtn v5.8b, v3.8h
+; CHECK-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEXT: rbit v20.8b, v18.8b
+; CHECK-NEXT: rbit v7.8b, v5.8b
+; CHECK-NEXT: rbit v16.8b, v6.8b
+; CHECK-NEXT: pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: pmul v3.8b, v6.8b, v3.8b
+; CHECK-NEXT: pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT: pmul v6.8b, v2.8b, v17.8b
+; CHECK-NEXT: pmul v7.8b, v16.8b, v7.8b
+; CHECK-NEXT: xtn v16.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: eor v3.8b, v3.8b, v4.8b
+; CHECK-NEXT: pmul v4.8b, v20.8b, v19.8b
+; CHECK-NEXT: ushll v5.8h, v5.8b, #0
+; CHECK-NEXT: rbit v7.8b, v7.8b
+; CHECK-NEXT: rbit v21.8b, v16.8b
+; CHECK-NEXT: pmul v2.8b, v16.8b, v2.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEXT: pmul v20.8b, v21.8b, v20.8b
+; CHECK-NEXT: pmul v19.8b, v21.8b, v19.8b
+; CHECK-NEXT: ushr v4.8b, v4.8b, #1
+; CHECK-NEXT: eor v3.8b, v7.8b, v3.8b
+; CHECK-NEXT: pmul v7.8b, v18.8b, v1.8b
+; CHECK-NEXT: pmul v18.8b, v0.8b, v18.8b
+; CHECK-NEXT: rbit v20.8b, v20.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v17.8b
+; CHECK-NEXT: pmul v1.8b, v16.8b, v1.8b
+; CHECK-NEXT: shll v3.8h, v3.8b, #8
+; CHECK-NEXT: eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT: eor v2.8b, v2.8b, v18.8b
+; CHECK-NEXT: ushr v7.8b, v20.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: orr v3.16b, v5.16b, v3.16b
+; CHECK-NEXT: rbit v5.8b, v19.8b
+; CHECK-NEXT: eor v4.8b, v4.8b, v6.8b
+; CHECK-NEXT: eor v2.8b, v7.8b, v2.8b
+; CHECK-NEXT: rev16 v3.8b, v3.8b
+; CHECK-NEXT: ushr v1.8b, v5.8b, #1
+; CHECK-NEXT: pmul v5.8b, v16.8b, v17.8b
+; CHECK-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEXT: shll v2.8h, v2.8b, #8
+; CHECK-NEXT: rbit v3.8b, v3.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: eor v1.8b, v2.8b, v4.8b
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: ushr v2.4h, v3.4h, #1
+; CHECK-NEXT: ushll v3.8h, v5.8b, #0
+; CHECK-NEXT: eor v1.8b, v2.8b, v1.8b
+; CHECK-NEXT: orr v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: shll v1.4s, v1.4h, #16
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%zextx = zext <4 x i16> %x to <4 x i32>
%zexty = zext <4 x i16> %y to <4 x i32>
@@ -1966,152 +1692,138 @@ define <8 x i32> @clmul_v8i32_neon_zext(<8 x i16> %x, <8 x i16> %y) {
; CHECK-NEXT: .cfi_offset b10, -24
; CHECK-NEXT: .cfi_offset b11, -32
; CHECK-NEXT: .cfi_offset b12, -48
-; CHECK-NEXT: movi v19.4s, #2
-; CHECK-NEXT: movi v21.4s, #1
-; CHECK-NEXT: ushll2 v2.4s, v1.8h, #0
-; CHECK-NEXT: movi v17.4s, #4
-; CHECK-NEXT: movi v20.4s, #8
-; CHECK-NEXT: movi v5.4s, #16
-; CHECK-NEXT: movi v4.4s, #32
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: and v3.16b, v2.16b, v19.16b
-; CHECK-NEXT: and v6.16b, v2.16b, v21.16b
-; CHECK-NEXT: and v7.16b, v2.16b, v17.16b
-; CHECK-NEXT: and v16.16b, v2.16b, v20.16b
-; CHECK-NEXT: and v18.16b, v2.16b, v5.16b
-; CHECK-NEXT: and v22.16b, v2.16b, v4.16b
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: uzp1 v23.8h, v0.8h, v3.8h
-; CHECK-NEXT: movi v3.4s, #64
-; CHECK-NEXT: uzp1 v24.8h, v0.8h, v6.8h
-; CHECK-NEXT: movi v6.4s, #128
-; CHECK-NEXT: uzp1 v25.8h, v0.8h, v7.8h
-; CHECK-NEXT: movi v7.4s, #1, lsl #8
-; CHECK-NEXT: uzp1 v26.8h, v0.8h, v16.8h
-; CHECK-NEXT: uzp1 v27.8h, v0.8h, v18.8h
-; CHECK-NEXT: uzp1 v28.8h, v0.8h, v22.8h
-; CHECK-NEXT: movi v16.4s, #8, lsl #8
-; CHECK-NEXT: movi v18.4s, #16, lsl #8
-; CHECK-NEXT: movi v22.4s, #2, lsl #8
-; CHECK-NEXT: umull2 v29.4s, v0.8h, v23.8h
-; CHECK-NEXT: and v23.16b, v2.16b, v3.16b
-; CHECK-NEXT: umull2 v24.4s, v0.8h, v24.8h
-; CHECK-NEXT: and v30.16b, v2.16b, v6.16b
-; CHECK-NEXT: and v31.16b, v2.16b, v7.16b
-; CHECK-NEXT: umull2 v25.4s, v0.8h, v25.8h
-; CHECK-NEXT: umull2 v26.4s, v0.8h, v26.8h
-; CHECK-NEXT: umull2 v27.4s, v0.8h, v27.8h
-; CHECK-NEXT: umull2 v28.4s, v0.8h, v28.8h
-; CHECK-NEXT: uzp1 v10.8h, v0.8h, v23.8h
-; CHECK-NEXT: movi v23.4s, #32, lsl #8
-; CHECK-NEXT: and v8.16b, v2.16b, v16.16b
-; CHECK-NEXT: and v9.16b, v2.16b, v18.16b
-; CHECK-NEXT: uzp1 v30.8h, v0.8h, v30.8h
-; CHECK-NEXT: uzp1 v31.8h, v0.8h, v31.8h
-; CHECK-NEXT: and v11.16b, v2.16b, v22.16b
-; CHECK-NEXT: eor v24.16b, v24.16b, v29.16b
-; CHECK-NEXT: xtn v12.4h, v19.4s
-; CHECK-NEXT: uzp1 v8.8h, v0.8h, v8.8h
-; CHECK-NEXT: eor v25.16b, v25.16b, v26.16b
-; CHECK-NEXT: eor v26.16b, v27.16b, v28.16b
-; CHECK-NEXT: uzp1 v9.8h, v0.8h, v9.8h
-; CHECK-NEXT: and v29.16b, v2.16b, v23.16b
-; CHECK-NEXT: umull2 v27.4s, v0.8h, v10.8h
-; CHECK-NEXT: umull2 v28.4s, v0.8h, v30.8h
-; CHECK-NEXT: uzp1 v30.8h, v0.8h, v11.8h
-; CHECK-NEXT: umull2 v31.4s, v0.8h, v31.8h
-; CHECK-NEXT: and v11.16b, v1.16b, v17.16b
-; CHECK-NEXT: eor v17.16b, v24.16b, v25.16b
-; CHECK-NEXT: and v10.16b, v1.16b, v21.16b
-; CHECK-NEXT: uzp1 v29.8h, v0.8h, v29.8h
-; CHECK-NEXT: umull2 v8.4s, v0.8h, v8.8h
-; CHECK-NEXT: movi v21.4s, #4, lsl #8
-; CHECK-NEXT: umull2 v9.4s, v0.8h, v9.8h
-; CHECK-NEXT: eor v19.16b, v26.16b, v27.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: umull2 v24.4s, v0.8h, v30.8h
-; CHECK-NEXT: eor v25.16b, v28.16b, v31.16b
-; CHECK-NEXT: xtn v28.4h, v11.4s
-; CHECK-NEXT: xtn v30.4h, v20.4s
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: umull2 v27.4s, v0.8h, v29.8h
-; CHECK-NEXT: xtn v10.4h, v10.4s
-; CHECK-NEXT: and v29.16b, v2.16b, v21.16b
-; CHECK-NEXT: eor v26.16b, v8.16b, v9.16b
-; CHECK-NEXT: and v9.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: eor v20.16b, v25.16b, v24.16b
-; CHECK-NEXT: and v25.16b, v1.16b, v5.16b
-; CHECK-NEXT: umull v28.4s, v0.4h, v28.4h
-; CHECK-NEXT: umull v30.4s, v0.4h, v30.4h
-; CHECK-NEXT: movi v24.4s, #64, lsl #8
-; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: eor v4.16b, v26.16b, v27.16b
-; CHECK-NEXT: and v26.16b, v1.16b, v6.16b
-; CHECK-NEXT: xtn v27.4h, v9.4s
-; CHECK-NEXT: xtn v25.4h, v25.4s
-; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT: xtn v16.4h, v16.4s
-; CHECK-NEXT: xtn v18.4h, v18.4s
-; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT: uzp1 v5.8h, v0.8h, v29.8h
-; CHECK-NEXT: xtn v26.4h, v26.4s
-; CHECK-NEXT: eor v28.16b, v28.16b, v30.16b
-; CHECK-NEXT: movi v30.4s, #128, lsl #8
-; CHECK-NEXT: umull v27.4s, v0.4h, v27.4h
-; CHECK-NEXT: and v29.16b, v2.16b, v24.16b
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: umull v25.4s, v0.4h, v25.4h
-; CHECK-NEXT: xtn v22.4h, v22.4s
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: xtn v23.4h, v23.4s
-; CHECK-NEXT: and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT: umull v31.4s, v0.4h, v12.4h
-; CHECK-NEXT: umull v8.4s, v0.4h, v10.4h
-; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: umull v26.4s, v0.4h, v26.4h
-; CHECK-NEXT: umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT: umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT: umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT: eor v25.16b, v25.16b, v27.16b
-; CHECK-NEXT: uzp1 v27.8h, v0.8h, v29.8h
-; CHECK-NEXT: and v2.16b, v2.16b, v30.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v30.16b
-; CHECK-NEXT: xtn v21.4h, v21.4s
-; CHECK-NEXT: xtn v24.4h, v24.4s
-; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT: umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT: umull v23.4s, v0.4h, v23.4h
-; CHECK-NEXT: eor v6.16b, v8.16b, v31.16b
-; CHECK-NEXT: eor v7.16b, v26.16b, v7.16b
+; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT: rev16 v5.8b, v1.8b
+; CHECK-NEXT: rev16 v6.8b, v0.8b
+; CHECK-NEXT: movi v4.2d, #0000000000000000
+; CHECK-NEXT: rev16 v7.8b, v3.8b
+; CHECK-NEXT: rev16 v17.8b, v2.8b
+; CHECK-NEXT: rbit v18.8b, v5.8b
+; CHECK-NEXT: rbit v19.8b, v6.8b
+; CHECK-NEXT: xtn v5.8b, v1.8h
+; CHECK-NEXT: xtn v16.8b, v4.8h
+; CHECK-NEXT: shrn v29.8b, v4.8h, #8
+; CHECK-NEXT: xtn v6.8b, v0.8h
+; CHECK-NEXT: shrn v4.8b, v0.8h, #8
+; CHECK-NEXT: xtn v0.8b, v3.8h
+; CHECK-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEXT: rbit v20.8b, v7.8b
+; CHECK-NEXT: rbit v17.8b, v17.8b
+; CHECK-NEXT: xtn v21.8b, v18.8h
+; CHECK-NEXT: xtn v22.8b, v19.8h
+; CHECK-NEXT: shrn v7.8b, v1.8h, #8
+; CHECK-NEXT: shrn v1.8b, v19.8h, #8
+; CHECK-NEXT: shrn v18.8b, v18.8h, #8
+; CHECK-NEXT: pmul v8.8b, v29.8b, v5.8b
+; CHECK-NEXT: rbit v23.8b, v5.8b
+; CHECK-NEXT: rbit v24.8b, v16.8b
+; CHECK-NEXT: pmul v12.8b, v4.8b, v16.8b
+; CHECK-NEXT: pmul v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: xtn v25.8b, v20.8h
+; CHECK-NEXT: xtn v26.8b, v17.8h
+; CHECK-NEXT: rbit v27.8b, v21.8b
+; CHECK-NEXT: rbit v28.8b, v22.8b
+; CHECK-NEXT: pmul v10.8b, v1.8b, v21.8b
+; CHECK-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEXT: pmul v18.8b, v22.8b, v18.8b
+; CHECK-NEXT: shrn v20.8b, v20.8h, #8
+; CHECK-NEXT: pmul v9.8b, v16.8b, v7.8b
+; CHECK-NEXT: xtn v1.8b, v2.8h
+; CHECK-NEXT: pmul v21.8b, v22.8b, v21.8b
+; CHECK-NEXT: pmul v19.8b, v24.8b, v23.8b
+; CHECK-NEXT: rbit v30.8b, v25.8b
+; CHECK-NEXT: rbit v31.8b, v26.8b
+; CHECK-NEXT: pmul v17.8b, v17.8b, v25.8b
+; CHECK-NEXT: pmul v27.8b, v28.8b, v27.8b
+; CHECK-NEXT: pmul v20.8b, v26.8b, v20.8b
+; CHECK-NEXT: rbit v28.8b, v6.8b
+; CHECK-NEXT: eor v18.8b, v18.8b, v10.8b
+; CHECK-NEXT: eor v8.8b, v9.8b, v8.8b
+; CHECK-NEXT: rbit v9.8b, v0.8b
+; CHECK-NEXT: rbit v10.8b, v1.8b
+; CHECK-NEXT: pmul v22.8b, v26.8b, v25.8b
+; CHECK-NEXT: shrn v2.8b, v2.8h, #8
+; CHECK-NEXT: pmul v30.8b, v31.8b, v30.8b
+; CHECK-NEXT: ushll v21.8h, v21.8b, #0
+; CHECK-NEXT: rbit v19.8b, v19.8b
+; CHECK-NEXT: rbit v27.8b, v27.8b
+; CHECK-NEXT: eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT: pmul v11.8b, v28.8b, v24.8b
+; CHECK-NEXT: pmul v25.8b, v24.8b, v9.8b
+; CHECK-NEXT: pmul v31.8b, v6.8b, v29.8b
+; CHECK-NEXT: pmul v7.8b, v6.8b, v7.8b
+; CHECK-NEXT: pmul v24.8b, v10.8b, v24.8b
+; CHECK-NEXT: ushll v22.8h, v22.8b, #0
+; CHECK-NEXT: pmul v5.8b, v6.8b, v5.8b
+; CHECK-NEXT: rbit v30.8b, v30.8b
+; CHECK-NEXT: ushr v19.8b, v19.8b, #1
+; CHECK-NEXT: ushr v27.8b, v27.8b, #1
+; CHECK-NEXT: rbit v11.8b, v11.8b
+; CHECK-NEXT: rbit v25.8b, v25.8b
+; CHECK-NEXT: eor v31.8b, v31.8b, v12.8b
+; CHECK-NEXT: eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT: rbit v24.8b, v24.8b
+; CHECK-NEXT: eor v19.8b, v19.8b, v8.8b
+; CHECK-NEXT: ushll v5.8h, v5.8b, #0
+; CHECK-NEXT: eor v18.8b, v27.8b, v18.8b
+; CHECK-NEXT: ushr v20.8b, v30.8b, #1
+; CHECK-NEXT: pmul v27.8b, v16.8b, v3.8b
+; CHECK-NEXT: pmul v16.8b, v2.8b, v16.8b
+; CHECK-NEXT: pmul v2.8b, v2.8b, v0.8b
+; CHECK-NEXT: pmul v3.8b, v1.8b, v3.8b
+; CHECK-NEXT: ushr v26.8b, v11.8b, #1
+; CHECK-NEXT: shll v19.8h, v19.8b, #8
+; CHECK-NEXT: shll v18.8h, v18.8b, #8
+; CHECK-NEXT: eor v17.8b, v20.8b, v17.8b
+; CHECK-NEXT: pmul v20.8b, v28.8b, v23.8b
+; CHECK-NEXT: pmul v28.8b, v1.8b, v29.8b
+; CHECK-NEXT: pmul v23.8b, v29.8b, v0.8b
+; CHECK-NEXT: ushr v24.8b, v24.8b, #1
+; CHECK-NEXT: eor v26.8b, v26.8b, v31.8b
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: shll v17.8h, v17.8b, #8
+; CHECK-NEXT: orr v18.16b, v21.16b, v18.16b
+; CHECK-NEXT: pmul v21.8b, v10.8b, v9.8b
; CHECK-NEXT: ldp d9, d8, [sp, #32] // 16-byte Folded Reload
-; CHECK-NEXT: eor v16.16b, v16.16b, v18.16b
-; CHECK-NEXT: uzp1 v2.8h, v0.8h, v2.8h
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull2 v5.4s, v0.8h, v5.8h
-; CHECK-NEXT: umull2 v18.4s, v0.8h, v27.8h
-; CHECK-NEXT: umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT: umull v24.4s, v0.4h, v24.4h
-; CHECK-NEXT: eor v6.16b, v6.16b, v28.16b
-; CHECK-NEXT: eor v3.16b, v25.16b, v3.16b
-; CHECK-NEXT: eor v7.16b, v7.16b, v22.16b
-; CHECK-NEXT: eor v16.16b, v16.16b, v23.16b
-; CHECK-NEXT: eor v17.16b, v17.16b, v19.16b
-; CHECK-NEXT: umull2 v2.4s, v0.8h, v2.8h
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: eor v5.16b, v20.16b, v5.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v18.16b
-; CHECK-NEXT: eor v1.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v21.16b
-; CHECK-NEXT: eor v6.16b, v16.16b, v24.16b
-; CHECK-NEXT: eor v5.16b, v17.16b, v5.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: eor v0.16b, v6.16b, v0.16b
-; CHECK-NEXT: eor v1.16b, v5.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
+; CHECK-NEXT: rbit v20.8b, v20.8b
+; CHECK-NEXT: ldp d11, d10, [sp, #16] // 16-byte Folded Reload
+; CHECK-NEXT: eor v16.8b, v28.8b, v16.8b
+; CHECK-NEXT: orr v17.16b, v22.16b, v17.16b
+; CHECK-NEXT: eor v23.8b, v27.8b, v23.8b
+; CHECK-NEXT: ushr v22.8b, v25.8b, #1
+; CHECK-NEXT: rbit v21.8b, v21.8b
+; CHECK-NEXT: rev16 v18.8b, v18.8b
+; CHECK-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-NEXT: ushr v7.8b, v20.8b, #1
+; CHECK-NEXT: eor v16.8b, v24.8b, v16.8b
+; CHECK-NEXT: rev16 v17.8b, v17.8b
+; CHECK-NEXT: eor v20.8b, v22.8b, v23.8b
+; CHECK-NEXT: shll v22.8h, v26.8b, #8
+; CHECK-NEXT: ushr v3.8b, v21.8b, #1
+; CHECK-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEXT: eor v4.8b, v7.8b, v4.8b
+; CHECK-NEXT: shll v7.8h, v16.8b, #8
+; CHECK-NEXT: shll v6.8h, v20.8b, #8
+; CHECK-NEXT: rbit v16.8b, v17.8b
+; CHECK-NEXT: eor v1.8b, v3.8b, v2.8b
+; CHECK-NEXT: eor v2.8b, v22.8b, v19.8b
+; CHECK-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEXT: ushr v3.4h, v18.4h, #1
+; CHECK-NEXT: eor v6.8b, v7.8b, v6.8b
+; CHECK-NEXT: ushr v7.4h, v16.4h, #1
+; CHECK-NEXT: shll v1.8h, v1.8b, #8
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: orr v3.16b, v5.16b, v4.16b
+; CHECK-NEXT: eor v4.8b, v7.8b, v6.8b
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEXT: ushll v2.4s, v3.4h, #0
+; CHECK-NEXT: shll v3.4s, v4.4h, #16
+; CHECK-NEXT: ushll v4.4s, v0.4h, #0
+; CHECK-NEXT: orr v0.16b, v2.16b, v1.16b
+; CHECK-NEXT: orr v1.16b, v4.16b, v3.16b
; CHECK-NEXT: ldr d12, [sp], #48 // 8-byte Folded Reload
; CHECK-NEXT: ret
%zextx = zext <8 x i16> %x to <8 x i32>
@@ -4525,72 +4237,26 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: clmulr_v8i16_neon:
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v1.16b, v1.16b
-; CHECK-NEXT: rev16 v3.16b, v0.16b
-; CHECK-NEXT: movi v2.8h, #2
-; CHECK-NEXT: movi v4.8h, #1
-; CHECK-NEXT: movi v5.8h, #4
-; CHECK-NEXT: movi v6.8h, #8
-; CHECK-NEXT: movi v7.8h, #16
-; CHECK-NEXT: movi v16.8h, #32
-; CHECK-NEXT: movi v17.8h, #128
-; CHECK-NEXT: movi v18.8h, #1, lsl #8
-; CHECK-NEXT: movi v19.8h, #8, lsl #8
-; CHECK-NEXT: movi v20.8h, #16, lsl #8
-; CHECK-NEXT: rbit v0.16b, v1.16b
-; CHECK-NEXT: rbit v1.16b, v3.16b
-; CHECK-NEXT: movi v3.8h, #64
-; CHECK-NEXT: movi v21.8h, #2, lsl #8
-; CHECK-NEXT: movi v22.8h, #32, lsl #8
-; CHECK-NEXT: movi v23.8h, #4, lsl #8
-; CHECK-NEXT: movi v24.8h, #64, lsl #8
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT: mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT: mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT: mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT: mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT: mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT: mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT: mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT: mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT: and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT: movi v6.8h, #128, lsl #8
-; CHECK-NEXT: mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT: mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT: and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT: mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT: eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT: eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT: eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT: mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT: mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT: eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT: eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT: eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: rev16 v0.16b, v0.16b
; CHECK-NEXT: rbit v0.16b, v0.16b
; CHECK-NEXT: ret
@@ -4605,87 +4271,29 @@ define <8 x i16> @clmulr_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i16> @clmulr_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
; CHECK-LABEL: clmulr_v4i16_neon:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4s, #2
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: movi v4.4s, #4
-; CHECK-NEXT: movi v5.4s, #8
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: movi v6.4s, #16
-; CHECK-NEXT: movi v7.4s, #32
-; CHECK-NEXT: movi v16.4s, #128
-; CHECK-NEXT: movi v17.4s, #1, lsl #8
-; CHECK-NEXT: movi v18.4s, #8, lsl #8
-; CHECK-NEXT: movi v19.4s, #16, lsl #8
-; CHECK-NEXT: movi v20.4s, #64
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: movi v21.4s, #2, lsl #8
-; CHECK-NEXT: movi v22.4s, #32, lsl #8
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: xtn v4.4h, v4.4s
-; CHECK-NEXT: xtn v5.4h, v5.4s
-; CHECK-NEXT: movi v23.4s, #4, lsl #8
-; CHECK-NEXT: movi v24.4s, #64, lsl #8
-; CHECK-NEXT: xtn v6.4h, v6.4s
-; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: xtn v16.4h, v16.4s
-; CHECK-NEXT: xtn v17.4h, v17.4s
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: xtn v18.4h, v18.4s
-; CHECK-NEXT: xtn v19.4h, v19.4s
-; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT: movi v25.4s, #128, lsl #8
-; CHECK-NEXT: xtn v20.4h, v20.4s
-; CHECK-NEXT: xtn v21.4h, v21.4s
-; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT: xtn v22.4h, v22.4s
-; CHECK-NEXT: and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT: umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT: umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT: umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT: umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT: umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT: umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT: xtn v4.4h, v23.4s
-; CHECK-NEXT: xtn v5.4h, v24.4s
-; CHECK-NEXT: umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT: umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT: umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT: eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT: eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT: eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #15
+; CHECK-NEXT: rev16 v1.8b, v1.8b
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v1.8b, v1.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
; CHECK-NEXT: ret
%a.ext = zext <4 x i16> %a to <4 x i32>
%b.ext = zext <4 x i16> %b to <4 x i32>
@@ -4699,136 +4307,87 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: clmulr_v4i32_neon:
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v1.16b, v1.16b
-; CHECK-NEXT: rev32 v2.16b, v0.16b
-; CHECK-NEXT: movi v3.4s, #2
-; CHECK-NEXT: movi v4.4s, #1
-; CHECK-NEXT: movi v5.4s, #4
-; CHECK-NEXT: movi v6.4s, #8
-; CHECK-NEXT: movi v7.4s, #16
-; CHECK-NEXT: movi v16.4s, #32
-; CHECK-NEXT: movi v17.4s, #64
-; CHECK-NEXT: movi v18.4s, #1, lsl #8
-; CHECK-NEXT: movi v19.4s, #2, lsl #8
-; CHECK-NEXT: movi v20.4s, #8, lsl #8
-; CHECK-NEXT: rbit v0.16b, v1.16b
-; CHECK-NEXT: rbit v1.16b, v2.16b
-; CHECK-NEXT: movi v2.4s, #128
-; CHECK-NEXT: movi v21.4s, #16, lsl #8
-; CHECK-NEXT: movi v22.4s, #8, lsl #16
-; CHECK-NEXT: movi v23.4s, #2, lsl #24
-; CHECK-NEXT: movi v25.4s, #4, lsl #24
-; CHECK-NEXT: movi v24.4s, #32, lsl #16
-; CHECK-NEXT: movi v26.4s, #8, lsl #24
-; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT: eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT: mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT: movi v18.4s, #32, lsl #8
-; CHECK-NEXT: and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT: movi v19.4s, #1, lsl #16
-; CHECK-NEXT: movi v20.4s, #4, lsl #8
-; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT: movi v17.4s, #2, lsl #16
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: movi v21.4s, #64, lsl #8
-; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT: and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT: movi v19.4s, #128, lsl #16
-; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT: movi v17.4s, #64, lsl #16
-; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT: mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT: movi v18.4s, #4, lsl #16
-; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT: movi v21.4s, #1, lsl #24
-; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT: mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT: movi v18.4s, #128, lsl #8
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #16
-; CHECK-NEXT: mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT: eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT: and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT: movi v24.4s, #64, lsl #24
-; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT: eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT: and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #24
-; CHECK-NEXT: movi v23.4s, #32, lsl #24
-; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT: and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT: movi v16.4s, #128, lsl #24
-; CHECK-NEXT: mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT: and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT: and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT: mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT: mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT: eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: rbit v2.16b, v0.16b
+; CHECK-NEXT: xtn v0.4h, v1.4s
+; CHECK-NEXT: xtn v3.4h, v2.4s
+; CHECK-NEXT: shrn v16.4h, v2.4s, #16
+; CHECK-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEXT: rev16 v4.8b, v0.8b
+; CHECK-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEXT: xtn v1.8b, v0.8h
+; CHECK-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEXT: xtn v2.8b, v3.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEXT: rbit v22.8b, v1.8b
+; CHECK-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEXT: rbit v24.8b, v2.8b
+; CHECK-NEXT: pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT: pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT: pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT: pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT: pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT: pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT: pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT: pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: rev32 v0.16b, v0.16b
; CHECK-NEXT: rbit v0.16b, v0.16b
; CHECK-NEXT: ret
@@ -4843,209 +4402,106 @@ define <4 x i32> @clmulr_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i32> @clmulr_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
; CHECK-NEON-LABEL: clmulr_v2i32_neon:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: mov w8, #2 // =0x2
-; CHECK-NEON-NEXT: mov w9, #1 // =0x1
-; CHECK-NEON-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT: dup v2.2d, x8
-; CHECK-NEON-NEXT: dup v3.2d, x9
-; CHECK-NEON-NEXT: mov w8, #4 // =0x4
-; CHECK-NEON-NEXT: mov w9, #8 // =0x8
-; CHECK-NEON-NEXT: dup v4.2d, x8
-; CHECK-NEON-NEXT: mov w8, #16 // =0x10
-; CHECK-NEON-NEXT: dup v5.2d, x9
-; CHECK-NEON-NEXT: dup v6.2d, x8
-; CHECK-NEON-NEXT: mov w8, #32 // =0x20
-; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT: dup v7.2d, x8
-; CHECK-NEON-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT: mov w8, #64 // =0x40
-; CHECK-NEON-NEXT: mov w9, #512 // =0x200
-; CHECK-NEON-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT: dup v16.2d, x8
-; CHECK-NEON-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT: xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT: mov w8, #128 // =0x80
-; CHECK-NEON-NEXT: xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: dup v17.2d, x8
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: mov w8, #256 // =0x100
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT: umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT: mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT: umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT: xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT: xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT: dup v16.2d, x8
-; CHECK-NEON-NEXT: mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT: and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v18.2d, x9
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT: xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT: umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: dup v6.2d, x8
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT: xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT: xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT: xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT: xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: dup v22.2d, x8
-; CHECK-NEON-NEXT: mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: dup v23.2d, x8
-; CHECK-NEON-NEXT: mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT: umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT: and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT: xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT: eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT: movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT: xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT: dup v7.2d, x8
-; CHECK-NEON-NEXT: mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT: xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT: xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT: eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: dup v22.2d, x8
-; CHECK-NEON-NEXT: mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT: xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v24.2d, x8
-; CHECK-NEON-NEXT: mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT: dup v25.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT: xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT: dup v26.2d, x8
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT: xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT: xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT: and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT: and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT: and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT: umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT: umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT: and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT: eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT: xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT: xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT: xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT: eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT: eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT: eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #31
+; CHECK-NEON-NEXT: rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT: rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT: xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT: xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT: shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT: rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT: xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT: xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT: rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT: rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT: pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT: pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v0.8b, v0.8b
; CHECK-NEON-NEXT: ret
;
; CHECK-AES-LABEL: clmulr_v2i32_neon:
; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT: rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v1.8b, v1.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
; CHECK-AES-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-AES-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT: shrn v0.2s, v0.2d, #31
+; CHECK-AES-NEXT: xtn v0.2s, v0.2d
+; CHECK-AES-NEXT: rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
; CHECK-AES-NEXT: ret
%a.ext = zext <2 x i32> %a to <2 x i64>
%b.ext = zext <2 x i32> %b to <2 x i64>
@@ -5113,72 +4569,26 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: clmulh_v8i16_neon:
; CHECK: // %bb.0:
; CHECK-NEXT: rev16 v1.16b, v1.16b
-; CHECK-NEXT: rev16 v3.16b, v0.16b
-; CHECK-NEXT: movi v2.8h, #2
-; CHECK-NEXT: movi v4.8h, #1
-; CHECK-NEXT: movi v5.8h, #4
-; CHECK-NEXT: movi v6.8h, #8
-; CHECK-NEXT: movi v7.8h, #16
-; CHECK-NEXT: movi v16.8h, #32
-; CHECK-NEXT: movi v17.8h, #128
-; CHECK-NEXT: movi v18.8h, #1, lsl #8
-; CHECK-NEXT: movi v19.8h, #8, lsl #8
-; CHECK-NEXT: movi v20.8h, #16, lsl #8
-; CHECK-NEXT: rbit v0.16b, v1.16b
-; CHECK-NEXT: rbit v1.16b, v3.16b
-; CHECK-NEXT: movi v3.8h, #64
-; CHECK-NEXT: movi v21.8h, #2, lsl #8
-; CHECK-NEXT: movi v22.8h, #32, lsl #8
-; CHECK-NEXT: movi v23.8h, #4, lsl #8
-; CHECK-NEXT: movi v24.8h, #64, lsl #8
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT: mul v2.8h, v1.8h, v2.8h
-; CHECK-NEXT: mul v4.8h, v1.8h, v4.8h
-; CHECK-NEXT: mul v5.8h, v1.8h, v5.8h
-; CHECK-NEXT: mul v6.8h, v1.8h, v6.8h
-; CHECK-NEXT: mul v7.8h, v1.8h, v7.8h
-; CHECK-NEXT: mul v16.8h, v1.8h, v16.8h
-; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: mul v17.8h, v1.8h, v17.8h
-; CHECK-NEXT: mul v18.8h, v1.8h, v18.8h
-; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT: mul v19.8h, v1.8h, v19.8h
-; CHECK-NEXT: mul v20.8h, v1.8h, v20.8h
-; CHECK-NEXT: and v22.16b, v0.16b, v22.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v2.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT: movi v6.8h, #128, lsl #8
-; CHECK-NEXT: mul v3.8h, v1.8h, v3.8h
-; CHECK-NEXT: mul v5.8h, v1.8h, v21.8h
-; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT: and v23.16b, v0.16b, v24.16b
-; CHECK-NEXT: mul v22.8h, v1.8h, v22.8h
-; CHECK-NEXT: eor v7.16b, v7.16b, v16.16b
-; CHECK-NEXT: eor v16.16b, v17.16b, v18.16b
-; CHECK-NEXT: eor v17.16b, v19.16b, v20.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v6.16b
-; CHECK-NEXT: mul v4.8h, v1.8h, v21.8h
-; CHECK-NEXT: mul v6.8h, v1.8h, v23.8h
-; CHECK-NEXT: eor v3.16b, v7.16b, v3.16b
-; CHECK-NEXT: eor v5.16b, v16.16b, v5.16b
-; CHECK-NEXT: eor v7.16b, v17.16b, v22.16b
-; CHECK-NEXT: mul v0.8h, v1.8h, v0.8h
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v2.16b, v5.16b, v4.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v6.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev16 v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: rbit v0.16b, v0.16b
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
; CHECK-NEXT: rev16 v0.16b, v0.16b
; CHECK-NEXT: rbit v0.16b, v0.16b
; CHECK-NEXT: ushr v0.8h, v0.8h, #1
@@ -5194,87 +4604,30 @@ define <8 x i16> @clmulh_v8i16_neon(<8 x i16> %a, <8 x i16> %b) nounwind {
define <4 x i16> @clmulh_v4i16_neon(<4 x i16> %a, <4 x i16> %b) nounwind {
; CHECK-LABEL: clmulh_v4i16_neon:
; CHECK: // %bb.0:
-; CHECK-NEXT: movi v2.4s, #2
-; CHECK-NEXT: movi v3.4s, #1
-; CHECK-NEXT: movi v4.4s, #4
-; CHECK-NEXT: movi v5.4s, #8
-; CHECK-NEXT: ushll v1.4s, v1.4h, #0
-; CHECK-NEXT: movi v6.4s, #16
-; CHECK-NEXT: movi v7.4s, #32
-; CHECK-NEXT: movi v16.4s, #128
-; CHECK-NEXT: movi v17.4s, #1, lsl #8
-; CHECK-NEXT: movi v18.4s, #8, lsl #8
-; CHECK-NEXT: movi v19.4s, #16, lsl #8
-; CHECK-NEXT: movi v20.4s, #64
-; CHECK-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEXT: movi v21.4s, #2, lsl #8
-; CHECK-NEXT: movi v22.4s, #32, lsl #8
-; CHECK-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEXT: xtn v2.4h, v2.4s
-; CHECK-NEXT: xtn v3.4h, v3.4s
-; CHECK-NEXT: xtn v4.4h, v4.4s
-; CHECK-NEXT: xtn v5.4h, v5.4s
-; CHECK-NEXT: movi v23.4s, #4, lsl #8
-; CHECK-NEXT: movi v24.4s, #64, lsl #8
-; CHECK-NEXT: xtn v6.4h, v6.4s
-; CHECK-NEXT: xtn v7.4h, v7.4s
-; CHECK-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEXT: xtn v16.4h, v16.4s
-; CHECK-NEXT: xtn v17.4h, v17.4s
-; CHECK-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEXT: xtn v18.4h, v18.4s
-; CHECK-NEXT: xtn v19.4h, v19.4s
-; CHECK-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEXT: umull v2.4s, v0.4h, v2.4h
-; CHECK-NEXT: umull v3.4s, v0.4h, v3.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v5.4s, v0.4h, v5.4h
-; CHECK-NEXT: movi v25.4s, #128, lsl #8
-; CHECK-NEXT: xtn v20.4h, v20.4s
-; CHECK-NEXT: xtn v21.4h, v21.4s
-; CHECK-NEXT: and v23.16b, v1.16b, v23.16b
-; CHECK-NEXT: xtn v22.4h, v22.4s
-; CHECK-NEXT: and v24.16b, v1.16b, v24.16b
-; CHECK-NEXT: umull v6.4s, v0.4h, v6.4h
-; CHECK-NEXT: umull v7.4s, v0.4h, v7.4h
-; CHECK-NEXT: umull v16.4s, v0.4h, v16.4h
-; CHECK-NEXT: umull v17.4s, v0.4h, v17.4h
-; CHECK-NEXT: umull v18.4s, v0.4h, v18.4h
-; CHECK-NEXT: umull v19.4s, v0.4h, v19.4h
-; CHECK-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEXT: and v1.16b, v1.16b, v25.16b
-; CHECK-NEXT: xtn v4.4h, v23.4s
-; CHECK-NEXT: xtn v5.4h, v24.4s
-; CHECK-NEXT: umull v20.4s, v0.4h, v20.4h
-; CHECK-NEXT: umull v21.4s, v0.4h, v21.4h
-; CHECK-NEXT: umull v22.4s, v0.4h, v22.4h
-; CHECK-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v16.16b, v18.16b, v19.16b
-; CHECK-NEXT: xtn v1.4h, v1.4s
-; CHECK-NEXT: umull v3.4s, v0.4h, v4.4h
-; CHECK-NEXT: umull v4.4s, v0.4h, v5.4h
-; CHECK-NEXT: eor v5.16b, v6.16b, v20.16b
-; CHECK-NEXT: eor v6.16b, v7.16b, v21.16b
-; CHECK-NEXT: eor v7.16b, v16.16b, v22.16b
-; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
-; CHECK-NEXT: eor v1.16b, v2.16b, v5.16b
-; CHECK-NEXT: eor v2.16b, v6.16b, v3.16b
-; CHECK-NEXT: eor v3.16b, v7.16b, v4.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEXT: shrn v0.4h, v0.4s, #16
+; CHECK-NEXT: rev16 v1.8b, v1.8b
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v1.8b, v1.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: xtn v2.8b, v1.8h
+; CHECK-NEXT: xtn v3.8b, v0.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v1.8b, v1.8h, #8
+; CHECK-NEXT: rbit v4.8b, v2.8b
+; CHECK-NEXT: rbit v5.8b, v3.8b
+; CHECK-NEXT: pmul v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: pmul v1.8b, v3.8b, v1.8b
+; CHECK-NEXT: pmul v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: pmul v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: ushr v1.8b, v4.8b, #1
+; CHECK-NEXT: eor v0.8b, v1.8b, v0.8b
+; CHECK-NEXT: ushll v1.8h, v2.8b, #0
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev16 v0.8b, v0.8b
+; CHECK-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEXT: ushr v0.4h, v0.4h, #1
; CHECK-NEXT: ret
%a.ext = zext <4 x i16> %a to <4 x i32>
%b.ext = zext <4 x i16> %b to <4 x i32>
@@ -5288,136 +4641,87 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: clmulh_v4i32_neon:
; CHECK: // %bb.0:
; CHECK-NEXT: rev32 v1.16b, v1.16b
-; CHECK-NEXT: rev32 v2.16b, v0.16b
-; CHECK-NEXT: movi v3.4s, #2
-; CHECK-NEXT: movi v4.4s, #1
-; CHECK-NEXT: movi v5.4s, #4
-; CHECK-NEXT: movi v6.4s, #8
-; CHECK-NEXT: movi v7.4s, #16
-; CHECK-NEXT: movi v16.4s, #32
-; CHECK-NEXT: movi v17.4s, #64
-; CHECK-NEXT: movi v18.4s, #1, lsl #8
-; CHECK-NEXT: movi v19.4s, #2, lsl #8
-; CHECK-NEXT: movi v20.4s, #8, lsl #8
-; CHECK-NEXT: rbit v0.16b, v1.16b
-; CHECK-NEXT: rbit v1.16b, v2.16b
-; CHECK-NEXT: movi v2.4s, #128
-; CHECK-NEXT: movi v21.4s, #16, lsl #8
-; CHECK-NEXT: movi v22.4s, #8, lsl #16
-; CHECK-NEXT: movi v23.4s, #2, lsl #24
-; CHECK-NEXT: movi v25.4s, #4, lsl #24
-; CHECK-NEXT: movi v24.4s, #32, lsl #16
-; CHECK-NEXT: movi v26.4s, #8, lsl #24
-; CHECK-NEXT: and v3.16b, v0.16b, v3.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v4.16b
-; CHECK-NEXT: and v5.16b, v0.16b, v5.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v6.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v7.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v16.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: and v2.16b, v0.16b, v2.16b
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: mul v5.4s, v1.4s, v5.4s
-; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: mul v2.4s, v1.4s, v2.4s
-; CHECK-NEXT: and v23.16b, v0.16b, v23.16b
-; CHECK-NEXT: eor v3.16b, v4.16b, v3.16b
-; CHECK-NEXT: eor v4.16b, v5.16b, v6.16b
-; CHECK-NEXT: eor v6.16b, v7.16b, v16.16b
-; CHECK-NEXT: mul v5.4s, v1.4s, v18.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v19.16b
-; CHECK-NEXT: movi v18.4s, #32, lsl #8
-; CHECK-NEXT: and v16.16b, v0.16b, v20.16b
-; CHECK-NEXT: movi v19.4s, #1, lsl #16
-; CHECK-NEXT: movi v20.4s, #4, lsl #8
-; CHECK-NEXT: eor v3.16b, v3.16b, v4.16b
-; CHECK-NEXT: and v4.16b, v0.16b, v21.16b
-; CHECK-NEXT: eor v6.16b, v6.16b, v17.16b
-; CHECK-NEXT: movi v17.4s, #2, lsl #16
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: eor v5.16b, v2.16b, v5.16b
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: and v18.16b, v0.16b, v18.16b
-; CHECK-NEXT: movi v21.4s, #64, lsl #8
-; CHECK-NEXT: mul v4.4s, v1.4s, v4.4s
-; CHECK-NEXT: eor v2.16b, v3.16b, v6.16b
-; CHECK-NEXT: and v3.16b, v0.16b, v19.16b
-; CHECK-NEXT: movi v19.4s, #128, lsl #16
-; CHECK-NEXT: and v20.16b, v0.16b, v20.16b
-; CHECK-NEXT: and v6.16b, v0.16b, v17.16b
-; CHECK-NEXT: movi v17.4s, #64, lsl #16
-; CHECK-NEXT: eor v5.16b, v5.16b, v7.16b
-; CHECK-NEXT: mul v7.4s, v1.4s, v18.4s
-; CHECK-NEXT: movi v18.4s, #4, lsl #16
-; CHECK-NEXT: mul v3.4s, v1.4s, v3.4s
-; CHECK-NEXT: eor v4.16b, v16.16b, v4.16b
-; CHECK-NEXT: and v16.16b, v0.16b, v21.16b
-; CHECK-NEXT: movi v21.4s, #1, lsl #24
-; CHECK-NEXT: and v19.16b, v0.16b, v19.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v6.4s
-; CHECK-NEXT: mul v20.4s, v1.4s, v20.4s
-; CHECK-NEXT: and v17.16b, v0.16b, v17.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v7.16b
-; CHECK-NEXT: and v7.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v16.4s, v1.4s, v16.4s
-; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT: and v21.16b, v0.16b, v21.16b
-; CHECK-NEXT: movi v18.4s, #128, lsl #8
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: eor v5.16b, v5.16b, v20.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #16
-; CHECK-NEXT: mul v21.4s, v1.4s, v21.4s
-; CHECK-NEXT: eor v4.16b, v4.16b, v16.16b
-; CHECK-NEXT: and v20.16b, v0.16b, v24.16b
-; CHECK-NEXT: movi v24.4s, #64, lsl #24
-; CHECK-NEXT: eor v2.16b, v2.16b, v5.16b
-; CHECK-NEXT: eor v16.16b, v17.16b, v19.16b
-; CHECK-NEXT: and v17.16b, v0.16b, v18.16b
-; CHECK-NEXT: mul v18.4s, v1.4s, v23.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: mul v6.4s, v1.4s, v7.4s
-; CHECK-NEXT: and v7.16b, v0.16b, v22.16b
-; CHECK-NEXT: and v19.16b, v0.16b, v25.16b
-; CHECK-NEXT: movi v22.4s, #16, lsl #24
-; CHECK-NEXT: movi v23.4s, #32, lsl #24
-; CHECK-NEXT: eor v16.16b, v16.16b, v21.16b
-; CHECK-NEXT: and v21.16b, v0.16b, v26.16b
-; CHECK-NEXT: mul v17.4s, v1.4s, v17.4s
-; CHECK-NEXT: mul v7.4s, v1.4s, v7.4s
-; CHECK-NEXT: mul v5.4s, v1.4s, v20.4s
-; CHECK-NEXT: mul v19.4s, v1.4s, v19.4s
-; CHECK-NEXT: eor v3.16b, v3.16b, v6.16b
-; CHECK-NEXT: eor v6.16b, v16.16b, v18.16b
-; CHECK-NEXT: movi v16.4s, #128, lsl #24
-; CHECK-NEXT: mul v18.4s, v1.4s, v21.4s
-; CHECK-NEXT: and v20.16b, v0.16b, v22.16b
-; CHECK-NEXT: and v21.16b, v0.16b, v23.16b
-; CHECK-NEXT: and v22.16b, v0.16b, v24.16b
-; CHECK-NEXT: eor v4.16b, v4.16b, v17.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v7.16b
-; CHECK-NEXT: eor v6.16b, v6.16b, v19.16b
-; CHECK-NEXT: and v0.16b, v0.16b, v16.16b
-; CHECK-NEXT: mul v7.4s, v1.4s, v20.4s
-; CHECK-NEXT: mul v16.4s, v1.4s, v21.4s
-; CHECK-NEXT: mul v17.4s, v1.4s, v22.4s
-; CHECK-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEXT: eor v3.16b, v3.16b, v5.16b
-; CHECK-NEXT: eor v4.16b, v6.16b, v18.16b
-; CHECK-NEXT: mul v0.4s, v1.4s, v0.4s
-; CHECK-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEXT: eor v2.16b, v4.16b, v7.16b
-; CHECK-NEXT: eor v3.16b, v16.16b, v17.16b
-; CHECK-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEXT: eor v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: rev32 v0.16b, v0.16b
+; CHECK-NEXT: rbit v1.16b, v1.16b
+; CHECK-NEXT: rbit v2.16b, v0.16b
+; CHECK-NEXT: xtn v0.4h, v1.4s
+; CHECK-NEXT: xtn v3.4h, v2.4s
+; CHECK-NEXT: shrn v16.4h, v2.4s, #16
+; CHECK-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEXT: rev16 v4.8b, v0.8b
+; CHECK-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEXT: xtn v1.8b, v0.8h
+; CHECK-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEXT: xtn v2.8b, v3.8h
+; CHECK-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEXT: rbit v22.8b, v1.8b
+; CHECK-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEXT: rbit v24.8b, v2.8b
+; CHECK-NEXT: pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEXT: pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEXT: pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEXT: pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEXT: pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEXT: pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEXT: pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEXT: pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: rev32 v0.16b, v0.16b
; CHECK-NEXT: rbit v0.16b, v0.16b
; CHECK-NEXT: ushr v0.4s, v0.4s, #1
@@ -5433,209 +4737,108 @@ define <4 x i32> @clmulh_v4i32_neon(<4 x i32> %a, <4 x i32> %b) nounwind {
define <2 x i32> @clmulh_v2i32_neon(<2 x i32> %a, <2 x i32> %b) nounwind {
; CHECK-NEON-LABEL: clmulh_v2i32_neon:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: mov w8, #2 // =0x2
-; CHECK-NEON-NEXT: mov w9, #1 // =0x1
-; CHECK-NEON-NEXT: ushll v1.2d, v1.2s, #0
-; CHECK-NEON-NEXT: dup v2.2d, x8
-; CHECK-NEON-NEXT: dup v3.2d, x9
-; CHECK-NEON-NEXT: mov w8, #4 // =0x4
-; CHECK-NEON-NEXT: mov w9, #8 // =0x8
-; CHECK-NEON-NEXT: dup v4.2d, x8
-; CHECK-NEON-NEXT: mov w8, #16 // =0x10
-; CHECK-NEON-NEXT: dup v5.2d, x9
-; CHECK-NEON-NEXT: dup v6.2d, x8
-; CHECK-NEON-NEXT: mov w8, #32 // =0x20
-; CHECK-NEON-NEXT: and v2.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT: and v3.16b, v1.16b, v3.16b
-; CHECK-NEON-NEXT: dup v7.2d, x8
-; CHECK-NEON-NEXT: and v4.16b, v1.16b, v4.16b
-; CHECK-NEON-NEXT: mov w8, #64 // =0x40
-; CHECK-NEON-NEXT: mov w9, #512 // =0x200
-; CHECK-NEON-NEXT: and v5.16b, v1.16b, v5.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT: dup v16.2d, x8
-; CHECK-NEON-NEXT: xtn v2.2s, v2.2d
-; CHECK-NEON-NEXT: xtn v3.2s, v3.2d
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT: xtn v4.2s, v4.2d
-; CHECK-NEON-NEXT: mov w8, #128 // =0x80
-; CHECK-NEON-NEXT: xtn v5.2s, v5.2d
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: dup v17.2d, x8
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: mov w8, #256 // =0x100
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT: umull v2.2d, v0.2s, v2.2s
-; CHECK-NEON-NEXT: umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT: mov w8, #2048 // =0x800
-; CHECK-NEON-NEXT: and v17.16b, v1.16b, v17.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT: umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: eor v2.16b, v3.16b, v2.16b
-; CHECK-NEON-NEXT: xtn v17.2s, v17.2d
-; CHECK-NEON-NEXT: eor v3.16b, v4.16b, v5.16b
-; CHECK-NEON-NEXT: xtn v4.2s, v16.2d
-; CHECK-NEON-NEXT: dup v16.2d, x8
-; CHECK-NEON-NEXT: mov w8, #4096 // =0x1000
-; CHECK-NEON-NEXT: and v5.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v18.2d, x9
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: mov w8, #8192 // =0x2000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: xtn v3.2s, v5.2d
-; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v16.16b
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1024 // =0x400
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v4.2s
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: mov w8, #16384 // =0x4000
-; CHECK-NEON-NEXT: xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #65536 // =0x10000
-; CHECK-NEON-NEXT: umull v3.2d, v0.2s, v3.2s
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #131072 // =0x20000
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: eor v4.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: dup v6.2d, x8
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: mov w8, #32768 // =0x8000
-; CHECK-NEON-NEXT: xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: eor v3.16b, v17.16b, v3.16b
-; CHECK-NEON-NEXT: xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: and v6.16b, v1.16b, v6.16b
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #262144 // =0x40000
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT: xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #4194304 // =0x400000
-; CHECK-NEON-NEXT: xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT: xtn v6.2s, v6.2d
-; CHECK-NEON-NEXT: dup v22.2d, x8
-; CHECK-NEON-NEXT: mov w8, #8388608 // =0x800000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: dup v23.2d, x8
-; CHECK-NEON-NEXT: mov w8, #524288 // =0x80000
-; CHECK-NEON-NEXT: umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT: and v18.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v7.16b
-; CHECK-NEON-NEXT: eor v5.16b, v5.16b, v16.16b
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: umull v6.2d, v0.2s, v6.2s
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT: and v21.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT: xtn v18.2s, v18.2d
-; CHECK-NEON-NEXT: eor v4.16b, v3.16b, v17.16b
-; CHECK-NEON-NEXT: movi v23.4s, #128, lsl #24
-; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v20.16b
-; CHECK-NEON-NEXT: xtn v5.2s, v7.2d
-; CHECK-NEON-NEXT: dup v7.2d, x8
-; CHECK-NEON-NEXT: mov w8, #16777216 // =0x1000000
-; CHECK-NEON-NEXT: xtn v17.2s, v19.2d
-; CHECK-NEON-NEXT: xtn v19.2s, v21.2d
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #33554432 // =0x2000000
-; CHECK-NEON-NEXT: eor v6.16b, v16.16b, v6.16b
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v18.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1048576 // =0x100000
-; CHECK-NEON-NEXT: dup v21.2d, x8
-; CHECK-NEON-NEXT: mov w8, #2097152 // =0x200000
-; CHECK-NEON-NEXT: and v7.16b, v1.16b, v7.16b
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: dup v22.2d, x8
-; CHECK-NEON-NEXT: mov w8, #67108864 // =0x4000000
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: umull v19.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: fneg v23.2d, v23.2d
-; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v16.16b
-; CHECK-NEON-NEXT: and v16.16b, v1.16b, v18.16b
-; CHECK-NEON-NEXT: xtn v7.2s, v7.2d
-; CHECK-NEON-NEXT: xtn v18.2s, v20.2d
-; CHECK-NEON-NEXT: dup v20.2d, x8
-; CHECK-NEON-NEXT: mov w8, #134217728 // =0x8000000
-; CHECK-NEON-NEXT: and v21.16b, v1.16b, v21.16b
-; CHECK-NEON-NEXT: and v22.16b, v1.16b, v22.16b
-; CHECK-NEON-NEXT: umull v5.2d, v0.2s, v5.2s
-; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v19.16b
-; CHECK-NEON-NEXT: xtn v16.2s, v16.2d
-; CHECK-NEON-NEXT: dup v19.2d, x8
-; CHECK-NEON-NEXT: mov w8, #268435456 // =0x10000000
-; CHECK-NEON-NEXT: and v20.16b, v1.16b, v20.16b
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v7.2s
-; CHECK-NEON-NEXT: umull v18.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: dup v24.2d, x8
-; CHECK-NEON-NEXT: mov w8, #536870912 // =0x20000000
-; CHECK-NEON-NEXT: dup v25.2d, x8
-; CHECK-NEON-NEXT: mov w8, #1073741824 // =0x40000000
-; CHECK-NEON-NEXT: and v19.16b, v1.16b, v19.16b
-; CHECK-NEON-NEXT: xtn v21.2s, v21.2d
-; CHECK-NEON-NEXT: xtn v20.2s, v20.2d
-; CHECK-NEON-NEXT: dup v26.2d, x8
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: eor v6.16b, v6.16b, v7.16b
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v4.16b
-; CHECK-NEON-NEXT: eor v17.16b, v17.16b, v18.16b
-; CHECK-NEON-NEXT: xtn v18.2s, v22.2d
-; CHECK-NEON-NEXT: xtn v19.2s, v19.2d
-; CHECK-NEON-NEXT: and v22.16b, v1.16b, v24.16b
-; CHECK-NEON-NEXT: and v24.16b, v1.16b, v25.16b
-; CHECK-NEON-NEXT: and v25.16b, v1.16b, v26.16b
-; CHECK-NEON-NEXT: umull v21.2d, v0.2s, v21.2s
-; CHECK-NEON-NEXT: umull v20.2d, v0.2s, v20.2s
-; CHECK-NEON-NEXT: and v1.16b, v1.16b, v23.16b
-; CHECK-NEON-NEXT: eor v7.16b, v17.16b, v16.16b
-; CHECK-NEON-NEXT: eor v3.16b, v3.16b, v5.16b
-; CHECK-NEON-NEXT: xtn v16.2s, v22.2d
-; CHECK-NEON-NEXT: xtn v17.2s, v24.2d
-; CHECK-NEON-NEXT: xtn v22.2s, v25.2d
-; CHECK-NEON-NEXT: umull v4.2d, v0.2s, v18.2s
-; CHECK-NEON-NEXT: umull v18.2d, v0.2s, v19.2s
-; CHECK-NEON-NEXT: xtn v1.2s, v1.2d
-; CHECK-NEON-NEXT: eor v5.16b, v6.16b, v21.16b
-; CHECK-NEON-NEXT: eor v6.16b, v7.16b, v20.16b
-; CHECK-NEON-NEXT: eor v2.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: umull v7.2d, v0.2s, v16.2s
-; CHECK-NEON-NEXT: umull v16.2d, v0.2s, v17.2s
-; CHECK-NEON-NEXT: umull v17.2d, v0.2s, v22.2s
-; CHECK-NEON-NEXT: eor v3.16b, v5.16b, v4.16b
-; CHECK-NEON-NEXT: eor v4.16b, v6.16b, v18.16b
-; CHECK-NEON-NEXT: umull v0.2d, v0.2s, v1.2s
-; CHECK-NEON-NEXT: eor v1.16b, v2.16b, v3.16b
-; CHECK-NEON-NEXT: eor v2.16b, v4.16b, v7.16b
-; CHECK-NEON-NEXT: eor v3.16b, v16.16b, v17.16b
-; CHECK-NEON-NEXT: eor v1.16b, v1.16b, v2.16b
-; CHECK-NEON-NEXT: eor v0.16b, v3.16b, v0.16b
-; CHECK-NEON-NEXT: eor v0.16b, v1.16b, v0.16b
-; CHECK-NEON-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-NEON-NEXT: rev32 v1.8b, v1.8b
+; CHECK-NEON-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v1.8b, v1.8b
+; CHECK-NEON-NEXT: rbit v2.8b, v0.8b
+; CHECK-NEON-NEXT: xtn v0.4h, v1.4s
+; CHECK-NEON-NEXT: xtn v3.4h, v2.4s
+; CHECK-NEON-NEXT: shrn v16.4h, v2.4s, #16
+; CHECK-NEON-NEXT: shrn v17.4h, v1.4s, #16
+; CHECK-NEON-NEXT: xtn v20.8b, v16.8h
+; CHECK-NEON-NEXT: shrn v16.8b, v16.8h, #8
+; CHECK-NEON-NEXT: rev16 v4.8b, v0.8b
+; CHECK-NEON-NEXT: rev16 v5.8b, v3.8b
+; CHECK-NEON-NEXT: xtn v1.8b, v0.8h
+; CHECK-NEON-NEXT: xtn v21.8b, v17.8h
+; CHECK-NEON-NEXT: xtn v2.8b, v3.8h
+; CHECK-NEON-NEXT: shrn v0.8b, v0.8h, #8
+; CHECK-NEON-NEXT: shrn v3.8b, v3.8h, #8
+; CHECK-NEON-NEXT: shrn v17.8b, v17.8h, #8
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: rbit v5.8b, v5.8b
+; CHECK-NEON-NEXT: rbit v22.8b, v1.8b
+; CHECK-NEON-NEXT: rbit v23.8b, v21.8b
+; CHECK-NEON-NEXT: rbit v24.8b, v2.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v16.8b, v1.8b
+; CHECK-NEON-NEXT: pmul v25.8b, v20.8b, v0.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v2.8b, v17.8b
+; CHECK-NEON-NEXT: pmul v0.8b, v2.8b, v0.8b
+; CHECK-NEON-NEXT: xtn v6.8b, v4.8h
+; CHECK-NEON-NEXT: xtn v7.8b, v5.8h
+; CHECK-NEON-NEXT: shrn v5.8b, v5.8h, #8
+; CHECK-NEON-NEXT: shrn v4.8b, v4.8h, #8
+; CHECK-NEON-NEXT: pmul v23.8b, v24.8b, v23.8b
+; CHECK-NEON-NEXT: rbit v18.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v7.8b
+; CHECK-NEON-NEXT: pmul v5.8b, v5.8b, v6.8b
+; CHECK-NEON-NEXT: pmul v4.8b, v7.8b, v4.8b
+; CHECK-NEON-NEXT: pmul v6.8b, v7.8b, v6.8b
+; CHECK-NEON-NEXT: rbit v7.8b, v23.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v19.8b, v18.8b
+; CHECK-NEON-NEXT: rbit v19.8b, v20.8b
+; CHECK-NEON-NEXT: eor v4.8b, v4.8b, v5.8b
+; CHECK-NEON-NEXT: ushll v6.8h, v6.8b, #0
+; CHECK-NEON-NEXT: ushr v7.8b, v7.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v18.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v19.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v5.8b, v18.8b, #1
+; CHECK-NEON-NEXT: rbit v18.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v19.8b, v3.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v3.8b, v3.8b, v1.8b
+; CHECK-NEON-NEXT: eor v4.8b, v5.8b, v4.8b
+; CHECK-NEON-NEXT: eor v5.8b, v25.8b, v16.8b
+; CHECK-NEON-NEXT: eor v16.8b, v17.8b, v19.8b
+; CHECK-NEON-NEXT: pmul v17.8b, v24.8b, v22.8b
+; CHECK-NEON-NEXT: ushr v18.8b, v18.8b, #1
+; CHECK-NEON-NEXT: eor v0.8b, v0.8b, v3.8b
+; CHECK-NEON-NEXT: shll v4.8h, v4.8b, #8
+; CHECK-NEON-NEXT: eor v5.8b, v18.8b, v5.8b
+; CHECK-NEON-NEXT: pmul v18.8b, v20.8b, v1.8b
+; CHECK-NEON-NEXT: eor v7.8b, v7.8b, v16.8b
+; CHECK-NEON-NEXT: pmul v16.8b, v2.8b, v21.8b
+; CHECK-NEON-NEXT: pmul v1.8b, v2.8b, v1.8b
+; CHECK-NEON-NEXT: orr v4.16b, v6.16b, v4.16b
+; CHECK-NEON-NEXT: rbit v6.8b, v17.8b
+; CHECK-NEON-NEXT: shll v5.8h, v5.8b, #8
+; CHECK-NEON-NEXT: shll v7.8h, v7.8b, #8
+; CHECK-NEON-NEXT: ushll v17.8h, v18.8b, #0
+; CHECK-NEON-NEXT: rev16 v4.8b, v4.8b
+; CHECK-NEON-NEXT: ushll v16.8h, v16.8b, #0
+; CHECK-NEON-NEXT: ushll v1.8h, v1.8b, #0
+; CHECK-NEON-NEXT: ushr v3.8b, v6.8b, #1
+; CHECK-NEON-NEXT: orr v5.16b, v17.16b, v5.16b
+; CHECK-NEON-NEXT: orr v6.16b, v16.16b, v7.16b
+; CHECK-NEON-NEXT: rbit v4.8b, v4.8b
+; CHECK-NEON-NEXT: eor v0.8b, v3.8b, v0.8b
+; CHECK-NEON-NEXT: eor v2.8b, v6.8b, v5.8b
+; CHECK-NEON-NEXT: shll v0.8h, v0.8b, #8
+; CHECK-NEON-NEXT: ushr v3.4h, v4.4h, #1
+; CHECK-NEON-NEXT: orr v0.16b, v1.16b, v0.16b
+; CHECK-NEON-NEXT: eor v2.8b, v3.8b, v2.8b
+; CHECK-NEON-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-NEON-NEXT: shll v1.4s, v2.4h, #16
+; CHECK-NEON-NEXT: orr v0.16b, v0.16b, v1.16b
+; CHECK-NEON-NEXT: rev32 v0.8b, v0.8b
+; CHECK-NEON-NEXT: rbit v0.8b, v0.8b
+; CHECK-NEON-NEXT: ushr v0.2s, v0.2s, #1
; CHECK-NEON-NEXT: ret
;
; CHECK-AES-LABEL: clmulh_v2i32_neon:
; CHECK-AES: // %bb.0:
+; CHECK-AES-NEXT: rev32 v1.8b, v1.8b
+; CHECK-AES-NEXT: rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v1.8b, v1.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
; CHECK-AES-NEXT: ushll v1.2d, v1.2s, #0
; CHECK-AES-NEXT: ushll v0.2d, v0.2s, #0
; CHECK-AES-NEXT: pmull2 v2.1q, v0.2d, v1.2d
; CHECK-AES-NEXT: pmull v0.1q, v0.1d, v1.1d
; CHECK-AES-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-AES-NEXT: shrn v0.2s, v0.2d, #32
+; CHECK-AES-NEXT: xtn v0.2s, v0.2d
+; CHECK-AES-NEXT: rev32 v0.8b, v0.8b
+; CHECK-AES-NEXT: rbit v0.8b, v0.8b
+; CHECK-AES-NEXT: ushr v0.2s, v0.2s, #1
; CHECK-AES-NEXT: ret
%a.ext = zext <2 x i32> %a to <2 x i64>
%b.ext = zext <2 x i32> %b to <2 x i64>
diff --git a/llvm/test/CodeGen/PowerPC/clmul-vector.ll b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
index f57dbeade4805..1bebca731bc2d 100644
--- a/llvm/test/CodeGen/PowerPC/clmul-vector.ll
+++ b/llvm/test/CodeGen/PowerPC/clmul-vector.ll
@@ -8779,7 +8779,7 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
;
; LE-LABEL: clmulh_v2i64:
; LE: # %bb.0:
-; LE-NEXT: stdu 1, -736(1)
+; LE-NEXT: stdu 1, -752(1)
; LE-NEXT: lis 4, -21846
; LE-NEXT: lis 5, 21845
; LE-NEXT: xxswapd 1, 35
@@ -8792,8 +8792,8 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: ori 5, 5, 21845
; LE-NEXT: mffprd 8, 1
; LE-NEXT: mffprd 10, 0
-; LE-NEXT: std 28, 704(1) # 8-byte Folded Spill
-; LE-NEXT: std 29, 712(1) # 8-byte Folded Spill
+; LE-NEXT: std 28, 720(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 728(1) # 8-byte Folded Spill
; LE-NEXT: ori 6, 6, 52428
; LE-NEXT: ori 7, 7, 13107
; LE-NEXT: sldi 4, 4, 32
@@ -8802,7 +8802,7 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: sldi 7, 7, 32
; LE-NEXT: sldi 11, 3, 1
; LE-NEXT: rldicl 3, 3, 63, 1
-; LE-NEXT: std 30, 720(1) # 8-byte Folded Spill
+; LE-NEXT: std 30, 736(1) # 8-byte Folded Spill
; LE-NEXT: lis 0, -3856
; LE-NEXT: oris 4, 4, 43690
; LE-NEXT: oris 5, 5, 21845
@@ -8811,48 +8811,50 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: sldi 12, 10, 1
; LE-NEXT: rldicl 10, 10, 63, 1
; LE-NEXT: oris 7, 7, 13107
-; LE-NEXT: std 27, 696(1) # 8-byte Folded Spill
+; LE-NEXT: std 27, 712(1) # 8-byte Folded Spill
; LE-NEXT: ori 28, 4, 43690
; LE-NEXT: ori 29, 5, 21845
-; LE-NEXT: std 14, 592(1) # 8-byte Folded Spill
-; LE-NEXT: std 15, 600(1) # 8-byte Folded Spill
+; LE-NEXT: std 14, 608(1) # 8-byte Folded Spill
+; LE-NEXT: std 15, 616(1) # 8-byte Folded Spill
; LE-NEXT: sldi 4, 8, 1
; LE-NEXT: rldicl 5, 8, 63, 1
-; LE-NEXT: std 16, 608(1) # 8-byte Folded Spill
-; LE-NEXT: std 17, 616(1) # 8-byte Folded Spill
+; LE-NEXT: std 16, 624(1) # 8-byte Folded Spill
+; LE-NEXT: std 17, 632(1) # 8-byte Folded Spill
; LE-NEXT: sldi 8, 9, 1
; LE-NEXT: rldicl 9, 9, 63, 1
-; LE-NEXT: std 28, 568(1) # 8-byte Folded Spill
-; LE-NEXT: std 29, 576(1) # 8-byte Folded Spill
+; LE-NEXT: std 28, 584(1) # 8-byte Folded Spill
+; LE-NEXT: std 29, 592(1) # 8-byte Folded Spill
; LE-NEXT: and 11, 11, 28
; LE-NEXT: and 3, 3, 29
-; LE-NEXT: std 18, 624(1) # 8-byte Folded Spill
-; LE-NEXT: std 19, 632(1) # 8-byte Folded Spill
+; LE-NEXT: std 18, 640(1) # 8-byte Folded Spill
+; LE-NEXT: std 19, 648(1) # 8-byte Folded Spill
; LE-NEXT: and 4, 4, 28
; LE-NEXT: and 5, 5, 29
-; LE-NEXT: std 20, 640(1) # 8-byte Folded Spill
-; LE-NEXT: std 21, 648(1) # 8-byte Folded Spill
+; LE-NEXT: std 20, 656(1) # 8-byte Folded Spill
+; LE-NEXT: std 21, 664(1) # 8-byte Folded Spill
; LE-NEXT: and 8, 8, 28
; LE-NEXT: and 9, 9, 29
-; LE-NEXT: std 22, 656(1) # 8-byte Folded Spill
-; LE-NEXT: std 23, 664(1) # 8-byte Folded Spill
+; LE-NEXT: std 22, 672(1) # 8-byte Folded Spill
+; LE-NEXT: std 23, 680(1) # 8-byte Folded Spill
; LE-NEXT: and 12, 12, 28
; LE-NEXT: and 10, 10, 29
-; LE-NEXT: std 24, 672(1) # 8-byte Folded Spill
-; LE-NEXT: std 25, 680(1) # 8-byte Folded Spill
+; LE-NEXT: std 24, 688(1) # 8-byte Folded Spill
+; LE-NEXT: std 25, 696(1) # 8-byte Folded Spill
; LE-NEXT: or 3, 3, 11
; LE-NEXT: or 4, 5, 4
-; LE-NEXT: std 26, 688(1) # 8-byte Folded Spill
-; LE-NEXT: std 31, 728(1) # 8-byte Folded Spill
+; LE-NEXT: std 26, 704(1) # 8-byte Folded Spill
+; LE-NEXT: std 31, 744(1) # 8-byte Folded Spill
; LE-NEXT: ori 5, 0, 61680
; LE-NEXT: ori 11, 30, 3855
-; LE-NEXT: std 2, 584(1) # 8-byte Folded Spill
+; LE-NEXT: std 2, 600(1) # 8-byte Folded Spill
+; LE-NEXT: vspltisw 2, 1
; LE-NEXT: ori 30, 6, 52428
; LE-NEXT: ori 0, 7, 13107
-; LE-NEXT: std 30, 552(1) # 8-byte Folded Spill
-; LE-NEXT: std 0, 560(1) # 8-byte Folded Spill
+; LE-NEXT: std 30, 568(1) # 8-byte Folded Spill
+; LE-NEXT: std 0, 576(1) # 8-byte Folded Spill
; LE-NEXT: or 6, 9, 8
; LE-NEXT: or 7, 10, 12
+; LE-NEXT: vupklsw 2, 2
; LE-NEXT: sldi 8, 3, 2
; LE-NEXT: rldicl 3, 3, 62, 2
; LE-NEXT: sldi 9, 4, 2
@@ -8876,9 +8878,9 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: or 3, 3, 8
; LE-NEXT: or 4, 4, 9
; LE-NEXT: ori 30, 5, 61680
-; LE-NEXT: std 30, 536(1) # 8-byte Folded Spill
+; LE-NEXT: std 30, 552(1) # 8-byte Folded Spill
; LE-NEXT: ori 0, 10, 3855
-; LE-NEXT: std 0, 544(1) # 8-byte Folded Spill
+; LE-NEXT: std 0, 560(1) # 8-byte Folded Spill
; LE-NEXT: or 5, 6, 11
; LE-NEXT: or 6, 7, 12
; LE-NEXT: sldi 7, 3, 4
@@ -8929,167 +8931,172 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: rlwimi 4, 11, 8, 24, 31
; LE-NEXT: or 10, 5, 4
; LE-NEXT: rlwinm 4, 3, 0, 30, 30
-; LE-NEXT: std 4, 528(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 544(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 5, 5
-; LE-NEXT: std 4, 376(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 384(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 4, 4
-; LE-NEXT: std 4, 368(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 376(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 3, 3
-; LE-NEXT: std 4, 360(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 368(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 2, 2
-; LE-NEXT: std 4, 352(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 360(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 1, 1
-; LE-NEXT: std 4, 344(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 352(1) # 8-byte Folded Spill
; LE-NEXT: rlwinm 4, 3, 0, 0, 0
-; LE-NEXT: std 4, 336(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 344(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 32, 32
-; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 336(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 31, 33
-; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 280(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 30, 34
-; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 29, 35
-; LE-NEXT: std 4, 248(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 28, 36
-; LE-NEXT: std 4, 240(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 27, 37
-; LE-NEXT: std 4, 232(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 248(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 26, 38
-; LE-NEXT: std 4, 224(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 240(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 25, 39
-; LE-NEXT: std 4, 216(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 232(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 24, 40
; LE-NEXT: rldicl 0, 6, 32, 32
; LE-NEXT: rotlwi 30, 6, 24
; LE-NEXT: rotlwi 27, 0, 24
-; LE-NEXT: std 4, 208(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 224(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 23, 41
; LE-NEXT: rlwimi 30, 6, 8, 8, 15
; LE-NEXT: rlwimi 30, 6, 8, 24, 31
; LE-NEXT: rlwimi 27, 0, 8, 8, 15
-; LE-NEXT: std 4, 200(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 216(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 22, 42
; LE-NEXT: sldi 6, 30, 32
; LE-NEXT: rlwimi 27, 0, 8, 24, 31
; LE-NEXT: or 11, 6, 27
-; LE-NEXT: std 4, 192(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 208(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 21, 43
; LE-NEXT: clrldi 5, 3, 63
; LE-NEXT: rlwinm 6, 3, 0, 29, 29
; LE-NEXT: rlwinm 7, 3, 0, 28, 28
-; LE-NEXT: std 4, 184(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 200(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 20, 44
; LE-NEXT: rlwinm 8, 3, 0, 27, 27
; LE-NEXT: rlwinm 12, 3, 0, 26, 26
; LE-NEXT: rlwinm 0, 3, 0, 25, 25
-; LE-NEXT: std 4, 176(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 192(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 19, 45
; LE-NEXT: rlwinm 30, 3, 0, 24, 24
; LE-NEXT: rlwinm 29, 3, 0, 23, 23
; LE-NEXT: rlwinm 28, 3, 0, 22, 22
-; LE-NEXT: std 4, 168(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 184(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 18, 46
; LE-NEXT: rlwinm 27, 3, 0, 21, 21
; LE-NEXT: rlwinm 26, 3, 0, 20, 20
; LE-NEXT: rlwinm 25, 3, 0, 19, 19
-; LE-NEXT: std 4, 160(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 176(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 17, 47
; LE-NEXT: rlwinm 24, 3, 0, 18, 18
; LE-NEXT: rlwinm 23, 3, 0, 17, 17
; LE-NEXT: rlwinm 22, 3, 0, 16, 16
-; LE-NEXT: std 4, 152(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 168(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 16, 48
; LE-NEXT: rlwinm 21, 3, 0, 15, 15
; LE-NEXT: rlwinm 20, 3, 0, 14, 14
; LE-NEXT: rlwinm 19, 3, 0, 13, 13
-; LE-NEXT: std 4, 144(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 160(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 15, 49
; LE-NEXT: rlwinm 18, 3, 0, 12, 12
; LE-NEXT: rlwinm 17, 3, 0, 11, 11
; LE-NEXT: rlwinm 16, 3, 0, 10, 10
-; LE-NEXT: std 4, 136(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 152(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 14, 50
; LE-NEXT: rlwinm 15, 3, 0, 9, 9
; LE-NEXT: rlwinm 14, 3, 0, 8, 8
; LE-NEXT: rlwinm 31, 3, 0, 7, 7
-; LE-NEXT: std 4, 128(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 144(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 13, 51
; LE-NEXT: rlwinm 2, 3, 0, 6, 6
-; LE-NEXT: std 4, 120(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 136(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 12, 52
-; LE-NEXT: std 4, 112(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 128(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 11, 53
-; LE-NEXT: std 4, 104(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 120(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 10, 54
-; LE-NEXT: std 4, 96(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 112(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 9, 55
-; LE-NEXT: std 4, 88(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 104(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 8, 56
-; LE-NEXT: std 4, 80(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 96(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 7, 57
-; LE-NEXT: std 4, 72(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 88(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 6, 58
-; LE-NEXT: std 4, 64(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 80(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 5, 59
-; LE-NEXT: std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 72(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 4, 60
-; LE-NEXT: std 4, 48(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 64(1) # 8-byte Folded Spill
; LE-NEXT: rldicl 4, 3, 3, 61
-; LE-NEXT: rldicl 3, 3, 2, 62
-; LE-NEXT: std 3, 32(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 528(1) # 8-byte Folded Reload
-; LE-NEXT: std 4, 40(1) # 8-byte Folded Spill
+; LE-NEXT: std 4, 56(1) # 8-byte Folded Spill
+; LE-NEXT: rldicl 4, 3, 2, 62
+; LE-NEXT: rldicr 3, 3, 0, 0
+; LE-NEXT: std 3, 40(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 544(1) # 8-byte Folded Reload
+; LE-NEXT: std 4, 48(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 3
-; LE-NEXT: std 3, 288(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 296(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 5
-; LE-NEXT: std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 288(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 6
-; LE-NEXT: std 3, 296(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 3, 11, 7
; LE-NEXT: std 3, 304(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 3, 11, 8
+; LE-NEXT: mulld 3, 11, 7
; LE-NEXT: std 3, 312(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 3, 11, 12
+; LE-NEXT: mulld 3, 11, 8
; LE-NEXT: std 3, 320(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 3, 11, 0
+; LE-NEXT: mulld 3, 11, 12
; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 0
+; LE-NEXT: std 3, 544(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 30
-; LE-NEXT: std 3, 528(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 536(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 29
-; LE-NEXT: std 3, 520(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 528(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 28
-; LE-NEXT: std 3, 512(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 520(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 27
-; LE-NEXT: std 3, 504(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 512(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 26
-; LE-NEXT: std 3, 496(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 504(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 25
-; LE-NEXT: std 3, 488(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 496(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 24
-; LE-NEXT: std 3, 480(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 488(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 23
-; LE-NEXT: std 3, 472(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 480(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 22
-; LE-NEXT: std 3, 464(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 472(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 21
-; LE-NEXT: std 3, 456(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 464(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 20
-; LE-NEXT: std 3, 448(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 456(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 19
-; LE-NEXT: std 3, 440(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 448(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 18
-; LE-NEXT: std 3, 432(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 440(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 17
-; LE-NEXT: std 3, 424(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 432(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 16
-; LE-NEXT: std 3, 416(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 424(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 15
-; LE-NEXT: std 3, 408(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 416(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 14
-; LE-NEXT: std 3, 400(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 408(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 31
-; LE-NEXT: std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT: std 3, 400(1) # 8-byte Folded Spill
; LE-NEXT: mulld 3, 11, 2
+; LE-NEXT: std 3, 392(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 384(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
; LE-NEXT: std 3, 384(1) # 8-byte Folded Spill
; LE-NEXT: ld 3, 376(1) # 8-byte Folded Reload
; LE-NEXT: mulld 3, 11, 3
@@ -9107,109 +9114,112 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: mulld 3, 11, 3
; LE-NEXT: std 3, 344(1) # 8-byte Folded Spill
; LE-NEXT: ld 3, 336(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 3, 11, 3
-; LE-NEXT: std 3, 336(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 272(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 4, 3, 32, 31
-; LE-NEXT: ld 3, 264(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 4, 11, 4
+; LE-NEXT: ld 3, 280(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 5, 3, 33, 30
-; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
+; LE-NEXT: ld 3, 272(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 6, 3, 34, 29
-; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
-; LE-NEXT: std 4, 272(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 4, 11, 5
-; LE-NEXT: ld 5, 280(1) # 8-byte Folded Reload
+; LE-NEXT: ld 3, 264(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 7, 3, 35, 28
-; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
+; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 8, 3, 36, 27
-; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
-; LE-NEXT: std 4, 264(1) # 8-byte Folded Spill
-; LE-NEXT: mulld 4, 11, 6
-; LE-NEXT: mulld 6, 11, 7
-; LE-NEXT: mulld 7, 11, 8
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 12, 3, 37, 26
-; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 8, 11, 12
-; LE-NEXT: std 4, 256(1) # 8-byte Folded Spill
-; LE-NEXT: clrldi 4, 9, 63
+; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 0, 3, 38, 25
-; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 4, 10, 4
-; LE-NEXT: mulld 12, 11, 0
+; LE-NEXT: ld 3, 232(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 30, 3, 39, 24
-; LE-NEXT: ld 3, 208(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 0, 11, 30
+; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 29, 3, 40, 23
-; LE-NEXT: ld 3, 200(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 30, 11, 29
+; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 28, 3, 41, 22
-; LE-NEXT: ld 3, 192(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 29, 11, 28
+; LE-NEXT: ld 3, 208(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 27, 3, 42, 21
-; LE-NEXT: ld 3, 184(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 28, 11, 27
+; LE-NEXT: ld 3, 200(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 26, 3, 43, 20
-; LE-NEXT: ld 3, 176(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 27, 11, 26
+; LE-NEXT: ld 3, 192(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 25, 3, 44, 19
-; LE-NEXT: ld 3, 168(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 26, 11, 25
+; LE-NEXT: ld 3, 184(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 24, 3, 45, 18
-; LE-NEXT: ld 3, 160(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 25, 11, 24
+; LE-NEXT: ld 3, 176(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 23, 3, 46, 17
-; LE-NEXT: ld 3, 152(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 24, 11, 23
+; LE-NEXT: ld 3, 168(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 22, 3, 47, 16
-; LE-NEXT: ld 3, 144(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 23, 11, 22
+; LE-NEXT: ld 3, 160(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 21, 3, 48, 15
-; LE-NEXT: ld 3, 136(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 22, 11, 21
+; LE-NEXT: ld 3, 152(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 20, 3, 49, 14
-; LE-NEXT: ld 3, 128(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 21, 11, 20
+; LE-NEXT: ld 3, 144(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 19, 3, 50, 13
-; LE-NEXT: ld 3, 120(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 20, 11, 19
+; LE-NEXT: ld 3, 136(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 18, 3, 51, 12
-; LE-NEXT: ld 3, 112(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 19, 11, 18
+; LE-NEXT: ld 3, 128(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 17, 3, 52, 11
-; LE-NEXT: ld 3, 104(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 18, 11, 17
+; LE-NEXT: ld 3, 120(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 16, 3, 53, 10
-; LE-NEXT: ld 3, 96(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 17, 11, 16
+; LE-NEXT: ld 3, 112(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 15, 3, 54, 9
-; LE-NEXT: ld 3, 88(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 16, 11, 15
+; LE-NEXT: ld 3, 104(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 14, 3, 55, 8
-; LE-NEXT: ld 3, 80(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 15, 11, 14
+; LE-NEXT: ld 3, 96(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 31, 3, 56, 7
-; LE-NEXT: ld 3, 72(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 14, 11, 31
+; LE-NEXT: ld 3, 88(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 2, 3, 57, 6
-; LE-NEXT: ld 3, 64(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 31, 11, 2
+; LE-NEXT: ld 3, 80(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 3, 3, 58, 5
-; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 56(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 256(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 72(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 3, 3, 59, 4
-; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 48(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 64(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 3, 3, 60, 3
-; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 56(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 3, 3, 61, 2
-; LE-NEXT: std 3, 224(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 32(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 48(1) # 8-byte Folded Reload
; LE-NEXT: rldicl 3, 3, 62, 1
-; LE-NEXT: std 3, 216(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 224(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 40(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 336(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 4
+; LE-NEXT: clrldi 4, 9, 63
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: std 3, 280(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 5
+; LE-NEXT: ld 5, 288(1) # 8-byte Folded Reload
+; LE-NEXT: std 3, 272(1) # 8-byte Folded Spill
+; LE-NEXT: mulld 3, 11, 6
+; LE-NEXT: mulld 6, 11, 7
+; LE-NEXT: mulld 7, 11, 8
+; LE-NEXT: mulld 8, 11, 12
+; LE-NEXT: mulld 12, 11, 0
+; LE-NEXT: mulld 0, 11, 30
+; LE-NEXT: mulld 30, 11, 29
+; LE-NEXT: mulld 29, 11, 28
+; LE-NEXT: mulld 28, 11, 27
+; LE-NEXT: mulld 27, 11, 26
+; LE-NEXT: mulld 26, 11, 25
+; LE-NEXT: mulld 25, 11, 24
+; LE-NEXT: mulld 24, 11, 23
+; LE-NEXT: mulld 23, 11, 22
+; LE-NEXT: mulld 22, 11, 21
+; LE-NEXT: mulld 21, 11, 20
+; LE-NEXT: mulld 20, 11, 19
+; LE-NEXT: mulld 19, 11, 18
+; LE-NEXT: mulld 18, 11, 17
+; LE-NEXT: mulld 17, 11, 16
+; LE-NEXT: mulld 16, 11, 15
+; LE-NEXT: mulld 15, 11, 14
+; LE-NEXT: mulld 14, 11, 31
+; LE-NEXT: mulld 31, 11, 2
+; LE-NEXT: std 3, 264(1) # 8-byte Folded Spill
+; LE-NEXT: ld 3, 256(1) # 8-byte Folded Reload
; LE-NEXT: mulld 2, 11, 3
+; LE-NEXT: ld 3, 248(1) # 8-byte Folded Reload
+; LE-NEXT: mulld 3, 11, 3
+; LE-NEXT: std 3, 256(1) # 8-byte Folded Spill
; LE-NEXT: ld 3, 240(1) # 8-byte Folded Reload
; LE-NEXT: mulld 3, 11, 3
; LE-NEXT: std 3, 248(1) # 8-byte Folded Spill
@@ -9217,46 +9227,42 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: mulld 3, 11, 3
; LE-NEXT: std 3, 240(1) # 8-byte Folded Spill
; LE-NEXT: ld 3, 224(1) # 8-byte Folded Reload
-; LE-NEXT: mulld 3, 11, 3
-; LE-NEXT: std 3, 232(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 216(1) # 8-byte Folded Reload
; LE-NEXT: mulld 11, 11, 3
; LE-NEXT: rlwinm 3, 9, 0, 30, 30
; LE-NEXT: mulld 3, 10, 3
; LE-NEXT: xor 3, 4, 3
-; LE-NEXT: ld 4, 288(1) # 8-byte Folded Reload
+; LE-NEXT: ld 4, 296(1) # 8-byte Folded Reload
; LE-NEXT: xor 4, 5, 4
; LE-NEXT: rlwinm 5, 9, 0, 29, 29
; LE-NEXT: mulld 5, 10, 5
; LE-NEXT: xor 3, 3, 5
-; LE-NEXT: ld 5, 296(1) # 8-byte Folded Reload
-; LE-NEXT: xor 4, 4, 5
-; LE-NEXT: rlwinm 5, 9, 0, 28, 28
-; LE-NEXT: mulld 5, 10, 5
-; LE-NEXT: xor 3, 3, 5
; LE-NEXT: ld 5, 304(1) # 8-byte Folded Reload
; LE-NEXT: xor 4, 4, 5
-; LE-NEXT: rlwinm 5, 9, 0, 27, 27
+; LE-NEXT: rlwinm 5, 9, 0, 28, 28
; LE-NEXT: mulld 5, 10, 5
; LE-NEXT: xor 3, 3, 5
; LE-NEXT: ld 5, 312(1) # 8-byte Folded Reload
; LE-NEXT: xor 4, 4, 5
-; LE-NEXT: rlwinm 5, 9, 0, 26, 26
+; LE-NEXT: rlwinm 5, 9, 0, 27, 27
; LE-NEXT: mulld 5, 10, 5
; LE-NEXT: xor 3, 3, 5
; LE-NEXT: ld 5, 320(1) # 8-byte Folded Reload
; LE-NEXT: xor 4, 4, 5
-; LE-NEXT: rlwinm 5, 9, 0, 25, 25
+; LE-NEXT: rlwinm 5, 9, 0, 26, 26
; LE-NEXT: mulld 5, 10, 5
; LE-NEXT: xor 3, 3, 5
; LE-NEXT: ld 5, 328(1) # 8-byte Folded Reload
; LE-NEXT: xor 4, 4, 5
-; LE-NEXT: rlwinm 5, 9, 0, 24, 24
+; LE-NEXT: rlwinm 5, 9, 0, 25, 25
; LE-NEXT: mulld 5, 10, 5
; LE-NEXT: xor 3, 3, 5
; LE-NEXT: std 3, 328(1) # 8-byte Folded Spill
-; LE-NEXT: ld 3, 528(1) # 8-byte Folded Reload
+; LE-NEXT: ld 3, 544(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: ld 4, 536(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 528(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 520(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 512(1) # 8-byte Folded Reload
@@ -9303,70 +9309,70 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 344(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
-; LE-NEXT: ld 4, 336(1) # 8-byte Folded Reload
+; LE-NEXT: ld 4, 280(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 272(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 264(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
; LE-NEXT: ld 4, 256(1) # 8-byte Folded Reload
-; LE-NEXT: xor 3, 3, 4
-; LE-NEXT: ld 4, 248(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 6
-; LE-NEXT: ld 6, 576(1) # 8-byte Folded Reload
+; LE-NEXT: ld 6, 592(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 7
-; LE-NEXT: ld 7, 568(1) # 8-byte Folded Reload
+; LE-NEXT: ld 7, 584(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 8
-; LE-NEXT: ld 8, 560(1) # 8-byte Folded Reload
+; LE-NEXT: ld 8, 576(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 12
-; LE-NEXT: ld 12, 544(1) # 8-byte Folded Reload
+; LE-NEXT: ld 12, 560(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 0
-; LE-NEXT: ld 0, 536(1) # 8-byte Folded Reload
+; LE-NEXT: ld 0, 552(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 30
-; LE-NEXT: ld 30, 720(1) # 8-byte Folded Reload
+; LE-NEXT: ld 30, 736(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 29
-; LE-NEXT: ld 29, 712(1) # 8-byte Folded Reload
+; LE-NEXT: ld 29, 728(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 28
-; LE-NEXT: ld 28, 704(1) # 8-byte Folded Reload
+; LE-NEXT: ld 28, 720(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 27
-; LE-NEXT: ld 27, 696(1) # 8-byte Folded Reload
+; LE-NEXT: ld 27, 712(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 26
-; LE-NEXT: ld 26, 688(1) # 8-byte Folded Reload
+; LE-NEXT: ld 26, 704(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 25
-; LE-NEXT: ld 25, 680(1) # 8-byte Folded Reload
+; LE-NEXT: ld 25, 696(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 24
-; LE-NEXT: ld 24, 672(1) # 8-byte Folded Reload
+; LE-NEXT: ld 24, 688(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 23
-; LE-NEXT: ld 23, 664(1) # 8-byte Folded Reload
+; LE-NEXT: ld 23, 680(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 22
-; LE-NEXT: ld 22, 656(1) # 8-byte Folded Reload
+; LE-NEXT: ld 22, 672(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 21
-; LE-NEXT: ld 21, 648(1) # 8-byte Folded Reload
+; LE-NEXT: ld 21, 664(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 20
-; LE-NEXT: ld 20, 640(1) # 8-byte Folded Reload
+; LE-NEXT: ld 20, 656(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 19
-; LE-NEXT: ld 19, 632(1) # 8-byte Folded Reload
+; LE-NEXT: ld 19, 648(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 18
-; LE-NEXT: ld 18, 624(1) # 8-byte Folded Reload
+; LE-NEXT: ld 18, 640(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 17
-; LE-NEXT: ld 17, 616(1) # 8-byte Folded Reload
+; LE-NEXT: ld 17, 632(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 16
-; LE-NEXT: ld 16, 608(1) # 8-byte Folded Reload
+; LE-NEXT: ld 16, 624(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 15
-; LE-NEXT: ld 15, 600(1) # 8-byte Folded Reload
+; LE-NEXT: ld 15, 616(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 14
-; LE-NEXT: ld 14, 592(1) # 8-byte Folded Reload
+; LE-NEXT: ld 14, 608(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 31
-; LE-NEXT: ld 31, 728(1) # 8-byte Folded Reload
+; LE-NEXT: ld 31, 744(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 2
-; LE-NEXT: ld 2, 584(1) # 8-byte Folded Reload
+; LE-NEXT: ld 2, 600(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
-; LE-NEXT: ld 4, 240(1) # 8-byte Folded Reload
+; LE-NEXT: ld 4, 248(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
-; LE-NEXT: ld 4, 232(1) # 8-byte Folded Reload
+; LE-NEXT: ld 4, 240(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: ld 4, 336(1) # 8-byte Folded Reload
; LE-NEXT: xor 3, 3, 11
-; LE-NEXT: ld 11, 552(1) # 8-byte Folded Reload
+; LE-NEXT: ld 11, 568(1) # 8-byte Folded Reload
+; LE-NEXT: xor 3, 3, 4
; LE-NEXT: sldi 4, 3, 1
; LE-NEXT: rldicl 3, 3, 63, 1
; LE-NEXT: and 4, 4, 7
@@ -9392,11 +9398,13 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: sldi 4, 5, 32
; LE-NEXT: or 3, 4, 3
; LE-NEXT: ld 4, 328(1) # 8-byte Folded Reload
-; LE-NEXT: rldicl 3, 3, 63, 1
; LE-NEXT: mtfprd 0, 3
-; LE-NEXT: rlwinm 3, 9, 0, 23, 23
+; LE-NEXT: rlwinm 3, 9, 0, 24, 24
; LE-NEXT: mulld 3, 10, 3
; LE-NEXT: xor 3, 4, 3
+; LE-NEXT: rlwinm 4, 9, 0, 23, 23
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
; LE-NEXT: rlwinm 4, 9, 0, 22, 22
; LE-NEXT: mulld 4, 10, 4
; LE-NEXT: xor 3, 3, 4
@@ -9590,6 +9598,9 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: rldicl 4, 4, 62, 1
; LE-NEXT: mulld 4, 10, 4
; LE-NEXT: xor 3, 3, 4
+; LE-NEXT: rldicr 4, 9, 0, 0
+; LE-NEXT: mulld 4, 10, 4
+; LE-NEXT: xor 3, 3, 4
; LE-NEXT: sldi 4, 3, 1
; LE-NEXT: rldicl 3, 3, 63, 1
; LE-NEXT: and 4, 4, 7
@@ -9614,10 +9625,10 @@ define <2 x i64> @clmulh_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
; LE-NEXT: rlwimi 4, 3, 8, 24, 31
; LE-NEXT: sldi 3, 4, 32
; LE-NEXT: or 3, 3, 5
-; LE-NEXT: rldicl 3, 3, 63, 1
; LE-NEXT: mtfprd 1, 3
-; LE-NEXT: xxmrghd 34, 1, 0
-; LE-NEXT: addi 1, 1, 736
+; LE-NEXT: xxmrghd 35, 1, 0
+; LE-NEXT: vsrd 2, 3, 2
+; LE-NEXT: addi 1, 1, 752
; LE-NEXT: blr
%a.ext = zext <2 x i64> %a to <2 x i128>
%b.ext = zext <2 x i64> %b to <2 x i128>
diff --git a/llvm/test/CodeGen/X86/clmul-vector.ll b/llvm/test/CodeGen/X86/clmul-vector.ll
index 8f26f84c01883..8ca41b57072ed 100644
--- a/llvm/test/CodeGen/X86/clmul-vector.ll
+++ b/llvm/test/CodeGen/X86/clmul-vector.ll
@@ -434,97 +434,78 @@ define <8 x i16> @clmul_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
;
; AVX2-LABEL: clmul_v8i16:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
-; AVX2-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm3
-; AVX2-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4
-; AVX2-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX2-NEXT: vpxor %xmm4, %xmm3, %xmm3
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm3, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT: vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX2-NEXT: vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX2-NEXT: vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX2-NEXT: vmovq %xmm5, %rax
+; AVX2-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX2-NEXT: vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX2-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX2-NEXT: vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX2-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
+; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
+; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512-LABEL: clmul_v8i16:
; AVX512: # %bb.0:
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpxor %xmm2, %xmm3, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm2 ^ xmm3
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm4
-; AVX512-NEXT: vpmullw %xmm4, %xmm0, %xmm4
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm4 = xmm4 ^ xmm3 ^ xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm3
-; AVX512-NEXT: vpmullw %xmm3, %xmm0, %xmm3
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = xmm3 ^ xmm4 ^ xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2
-; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm2
-; AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm0 ^ xmm3 ^ xmm2
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; AVX512-NEXT: vextracti128 $1, %ymm1, %xmm2
+; AVX512-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512-NEXT: vpclmulqdq $0, %xmm2, %xmm3, %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm6 = xmm3[1,1,1,1]
+; AVX512-NEXT: vpclmulqdq $0, %xmm5, %xmm6, %xmm5
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
+; AVX512-NEXT: vpclmulqdq $17, %xmm2, %xmm3, %xmm5
+; AVX512-NEXT: vmovq %xmm5, %rax
+; AVX512-NEXT: vpinsrd $2, %eax, %xmm4, %xmm4
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; AVX512-NEXT: vpclmulqdq $0, %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm4, %xmm2
+; AVX512-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[1,1,1,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[1,1,1,1]
+; AVX512-NEXT: vpclmulqdq $0, %xmm4, %xmm5, %xmm4
+; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
+; AVX512-NEXT: vpclmulqdq $17, %xmm1, %xmm0, %xmm4
+; AVX512-NEXT: vmovq %xmm4, %rax
+; AVX512-NEXT: vpinsrd $2, %eax, %xmm3, %xmm3
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
+; AVX512-NEXT: vpclmulqdq $0, %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vpinsrd $3, %eax, %xmm3, %xmm0
+; AVX512-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
+; AVX512-NEXT: vpmovdw %ymm0, %xmm0
+; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%res = call <8 x i16> @llvm.clmul.v8i16(<8 x i16> %a, <8 x i16> %b)
ret <8 x i16> %res
More information about the llvm-commits
mailing list