[llvm] 9e5e267 - [ISel] Introduce llvm.clmul intrinsic (#168731)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 5 12:24:12 PST 2026
Author: Ramkumar Ramachandra
Date: 2026-01-05T20:24:06Z
New Revision: 9e5e267a03984de730aa6e8741c4b3bddfc88728
URL: https://github.com/llvm/llvm-project/commit/9e5e267a03984de730aa6e8741c4b3bddfc88728
DIFF: https://github.com/llvm/llvm-project/commit/9e5e267a03984de730aa6e8741c4b3bddfc88728.diff
LOG: [ISel] Introduce llvm.clmul intrinsic (#168731)
In line with a C++ standard library proposal (P3642, linked below), introduce
the llvm.clmul family of intrinsics corresponding to carry-less multiply
operations. This work builds upon 727ee7e ([APInt] Introduce carry-less
multiply primitives), and follow-up patches will introduce custom lowering
on supported targets, replacing target-specific clmul intrinsics.
Testing is done on the RISC-V target, which should be sufficient to show
that the intrinsics work, since no RISC-V-specific lowering has been added.
Ref: https://isocpp.org/files/papers/P3642R3.html
Co-authored-by: Craig Topper <craig.topper at sifive.com>
Added:
llvm/test/CodeGen/RISCV/clmul.ll
llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
Modified:
llvm/docs/LangRef.rst
llvm/include/llvm/CodeGen/ISDOpcodes.h
llvm/include/llvm/CodeGen/SDPatternMatch.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/IR/Intrinsics.td
llvm/include/llvm/Target/TargetSelectionDAG.td
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
Removed:
################################################################################
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 5b462b87acb0f..02932bc07f333 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18387,8 +18387,6 @@ then the result is the size in bits of the type of ``src`` if
``is_zero_poison == 0`` and ``poison`` otherwise. For example,
``llvm.cttz(2) = 1``.
-.. _int_overflow:
-
.. _int_fshl:
'``llvm.fshl.*``' Intrinsic
@@ -18485,6 +18483,57 @@ Example:
%r = call i8 @llvm.fshr.i8(i8 15, i8 15, i8 11) ; %r = i8: 225 (0b11100001)
%r = call i8 @llvm.fshr.i8(i8 0, i8 255, i8 8) ; %r = i8: 255 (0b11111111)
+.. _int_clmul:
+
+'``llvm.clmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.clmul`` on any integer
+type or vector of integer elements.
+
+::
+
+ declare i16 @llvm.clmul.i16(i16 %a, i16 %b)
+ declare i32 @llvm.clmul.i32(i32 %a, i32 %b)
+ declare i64 @llvm.clmul.i64(i64 %a, i64 %b)
+ declare <4 x i32> @llvm.clmul.v4i32(<4 x i32> %a, <4 x i32> %b)
+
+Overview:
+"""""""""
+
+The '``llvm.clmul``' family of intrinsic functions performs carry-less
+multiplication, or XOR multiplication, on its two arguments, and returns
+the low bits of the result.
+
+Arguments:
+""""""""""
+
+The arguments may be any integer type or vector of integer type. Both
+arguments and the result must have the same type.
+
+Semantics:
+""""""""""
+
+The '``llvm.clmul``' intrinsic computes the carry-less multiply of its
+arguments, which is the result of applying the standard multiplication
+algorithm with all of the additions replaced by XORs, and returns the low
+bits of the result.
+The vector variants operate lane-wise.
+
+Example:
+""""""""
+
+.. code-block:: llvm
+
+ %r = call i4 @llvm.clmul.i4(i4 1, i4 2) ; %r = 2
+ %r = call i4 @llvm.clmul.i4(i4 5, i4 6) ; %r = 14
+ %r = call i4 @llvm.clmul.i4(i4 -4, i4 2) ; %r = -8
+ %r = call i4 @llvm.clmul.i4(i4 -4, i4 -5) ; %r = 4
+
+.. _int_overflow:
+
Arithmetic with Overflow Intrinsics
-----------------------------------
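
As a reference for the semantics added above: a minimal standalone C++ model
of the scalar operation (a sketch, not part of the patch). It is schoolbook
multiplication with the carrying additions replaced by XOR, truncated to the
lane width; the i4 examples from the LangRef block check out against it.

    #include <cassert>
    #include <cstdint>

    // Carry-less multiply of x and y, truncated to the low `bw` bits.
    static uint64_t clmul_ref(uint64_t x, uint64_t y, unsigned bw) {
      uint64_t mask = (bw == 64) ? ~0ULL : ((1ULL << bw) - 1);
      uint64_t acc = 0;
      for (unsigned i = 0; i < bw; ++i)
        if ((y >> i) & 1)
          acc ^= x << i; // XOR in place of the carrying addition
      return acc & mask;
    }

    int main() {
      // The i4 examples above; -4 is 0b1100 and -5 is 0b1011 in 4 bits.
      assert(clmul_ref(1, 2, 4) == 2);
      assert(clmul_ref(5, 6, 4) == 14);
      assert(clmul_ref(0xC, 2, 4) == 8);   // i4 -8
      assert(clmul_ref(0xC, 0xB, 4) == 4);
    }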
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
index c16a1018e118f..4bd4089c081fd 100644
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -769,6 +769,11 @@ enum NodeType {
FSHL,
FSHR,
+ /// Carry-less multiplication operations.
+ CLMUL,
+ CLMULR,
+ CLMULH,
+
/// Byte Swap and Counting operators.
BSWAP,
CTTZ,
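
Note that CLMULR and CLMULH have no corresponding intrinsic in this patch;
they exist as ISD nodes so that the combines and expansions below can target
them. Per the expandCLMUL lowering later in this commit, they are the
2*BW-bit carry-less product shifted right by BW-1 and by BW, respectively.
A standalone C++ model for BW = 32 (a sketch, not part of the patch):

    #include <cstdint>

    // Full (up to 63-bit) carry-less product of two 32-bit values.
    static uint64_t clmul_wide(uint32_t x, uint32_t y) {
      uint64_t acc = 0;
      for (unsigned i = 0; i < 32; ++i)
        if ((y >> i) & 1)
          acc ^= (uint64_t)x << i;
      return acc;
    }

    static uint32_t clmul32(uint32_t x, uint32_t y) {
      return (uint32_t)clmul_wide(x, y);         // low half
    }
    static uint32_t clmulr32(uint32_t x, uint32_t y) {
      return (uint32_t)(clmul_wide(x, y) >> 31); // "reversed" product
    }
    static uint32_t clmulh32(uint32_t x, uint32_t y) {
      return (uint32_t)(clmul_wide(x, y) >> 32); // high half
    }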
diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 026ee035fcf54..4316ab3335b69 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -921,6 +921,11 @@ inline BinaryOpc_match<LHS, RHS> m_Rotr(const LHS &L, const RHS &R) {
return BinaryOpc_match<LHS, RHS>(ISD::ROTR, L, R);
}
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS, true> m_Clmul(const LHS &L, const RHS &R) {
+ return BinaryOpc_match<LHS, RHS, true>(ISD::CLMUL, L, R);
+}
+
template <typename LHS, typename RHS>
inline BinaryOpc_match<LHS, RHS, true> m_FAdd(const LHS &L, const RHS &R) {
return BinaryOpc_match<LHS, RHS, true>(ISD::FADD, L, R);
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 8ad64a852b74d..8c01c58a0318f 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -5488,6 +5488,11 @@ class LLVM_ABI TargetLowering : public TargetLoweringBase {
/// \returns The expansion if successful, SDValue() otherwise
SDValue expandFunnelShift(SDNode *N, SelectionDAG &DAG) const;
+ /// Expand carry-less multiply.
+ /// \param N Node to expand
+ /// \returns The expansion if successful, SDValue() otherwise
+ SDValue expandCLMUL(SDNode *N, SelectionDAG &DAG) const;
+
/// Expand rotations.
/// \param N Node to expand
/// \param AllowVectorOps expand vector rotate, this should only be performed
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 16d7c2f4cd930..7eae4fd200500 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1478,6 +1478,8 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrNoCreateUndefOrPoison] in
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
def int_fshr : DefaultAttrsIntrinsic<[llvm_anyint_ty],
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>]>;
+ def int_clmul : DefaultAttrsIntrinsic<[llvm_anyint_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>]>;
}
let IntrProperties = [IntrNoMem, IntrSpeculatable,
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index 46cbde939e58c..68792a53907c0 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -446,6 +446,10 @@ def sra_parts : SDNode<"ISD::SRA_PARTS" , SDTIntShiftPairOp>;
def srl_parts : SDNode<"ISD::SRL_PARTS" , SDTIntShiftPairOp>;
def fshl : SDNode<"ISD::FSHL" , SDTIntShiftDOp>;
def fshr : SDNode<"ISD::FSHR" , SDTIntShiftDOp>;
+def clmul : SDNode<"ISD::CLMUL" , SDTIntBinOp,
+ [SDNPCommutative, SDNPAssociative]>;
+def clmulr : SDNode<"ISD::CLMULR" , SDTIntBinOp, [SDNPCommutative]>;
+def clmulh : SDNode<"ISD::CLMULH" , SDTIntBinOp, [SDNPCommutative]>;
def and : SDNode<"ISD::AND" , SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]>;
def or : SDNode<"ISD::OR" , SDTIntBinOp,
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 5ea36e5f78b52..ee28260a74127 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -11459,6 +11459,31 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
if (SDValue AVG = foldShiftToAvg(N, DL))
return AVG;
+ SDValue Y;
+ if (VT.getScalarSizeInBits() % 2 == 0 && N1C) {
+ // Fold clmul(zext(x), zext(y)) >> (BW - 1 | BW) -> clmul(r|h)(x, y).
+ unsigned HalfBW = VT.getScalarSizeInBits() / 2;
+ if (sd_match(N0, m_Clmul(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))) &&
+ X.getScalarValueSizeInBits() == HalfBW &&
+ Y.getScalarValueSizeInBits() == HalfBW) {
+ if (N1C->getZExtValue() == HalfBW - 1)
+ return DAG.getNode(
+ ISD::ZERO_EXTEND, DL, VT,
+ DAG.getNode(ISD::CLMULR, DL, X.getValueType(), X, Y));
+ if (N1C->getZExtValue() == HalfBW)
+ return DAG.getNode(
+ ISD::ZERO_EXTEND, DL, VT,
+ DAG.getNode(ISD::CLMULH, DL, X.getValueType(), X, Y));
+ }
+ }
+
+ // Fold bitreverse(clmul(bitreverse(x), bitreverse(y))) >> 1 ->
+ // clmulh(x, y).
+ if (N1C && N1C->getZExtValue() == 1 &&
+ sd_match(N0, m_BitReverse(m_Clmul(m_BitReverse(m_Value(X)),
+ m_BitReverse(m_Value(Y))))))
+ return DAG.getNode(ISD::CLMULH, DL, VT, X, Y);
+
return SDValue();
}
@@ -11810,6 +11835,10 @@ SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
sd_match(N0, m_Shl(m_BitReverse(m_Value(X)), m_Value(Y))))
return DAG.getNode(ISD::SRL, DL, VT, X, Y);
+ // fold bitreverse(clmul(bitreverse(x), bitreverse(y))) -> clmulr(x, y)
+ if (sd_match(N0, m_Clmul(m_BitReverse(m_Value(X)), m_BitReverse(m_Value(Y)))))
+ return DAG.getNode(ISD::CLMULR, DL, VT, X, Y);
+
return SDValue();
}
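
The two SRL folds above rest on identities that follow from the definitions:
reversing both inputs reverses the (2*BW-1)-bit carry-less product, so
bitreverse(clmul(bitreverse(x), bitreverse(y))) is exactly clmulr(x, y), and
clmulh is clmulr logically shifted right by one. An exhaustive standalone
check at i8 (a sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Full 15-bit carry-less product of two 8-bit values.
    static uint16_t clmul_full8(uint8_t x, uint8_t y) {
      uint16_t acc = 0;
      for (int i = 0; i < 8; ++i)
        if ((y >> i) & 1)
          acc ^= (uint16_t)x << i;
      return acc;
    }

    static uint8_t rev8(uint8_t v) {
      uint8_t r = 0;
      for (int i = 0; i < 8; ++i)
        r |= (uint8_t)(((v >> i) & 1) << (7 - i));
      return r;
    }

    int main() {
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y) {
          uint16_t Full = clmul_full8(x, y);
          uint8_t Clmulr = Full >> 7, Clmulh = Full >> 8;
          assert(Clmulr == rev8((uint8_t)clmul_full8(rev8(x), rev8(y))));
          assert(Clmulh == (uint8_t)(Clmulr >> 1));
        }
    }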
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 6476b828448c5..70d12461581f8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -4116,6 +4116,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (SDValue Expanded = TLI.expandROT(Node, true /*AllowVectorOps*/, DAG))
Results.push_back(Expanded);
break;
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
+ if (SDValue Expanded = TLI.expandCLMUL(Node, DAG))
+ Results.push_back(Expanded);
+ break;
case ISD::SADDSAT:
case ISD::UADDSAT:
case ISD::SSUBSAT:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 2a9edcf6e7996..c3cdaf25a6e13 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -348,6 +348,12 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_VPFunnelShift(N);
break;
+ case ISD::CLMUL:
+ case ISD::CLMULH:
+ case ISD::CLMULR:
+ Res = PromoteIntRes_CLMUL(N);
+ break;
+
case ISD::IS_FPCLASS:
Res = PromoteIntRes_IS_FPCLASS(N);
break;
@@ -1714,6 +1720,38 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VPFunnelShift(SDNode *N) {
return DAG.getNode(Opcode, DL, VT, Hi, Lo, Amt, Mask, EVL);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_CLMUL(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue X = ZExtPromotedInteger(N->getOperand(0));
+ SDValue Y = ZExtPromotedInteger(N->getOperand(1));
+
+ SDLoc DL(N);
+ EVT OldVT = N->getOperand(0).getValueType();
+ EVT VT = X.getValueType();
+
+ if (Opcode == ISD::CLMUL)
+ return DAG.getNode(ISD::CLMUL, DL, VT, X, Y);
+
+ unsigned OldBits = OldVT.getScalarSizeInBits();
+ unsigned NewBits = VT.getScalarSizeInBits();
+ if (NewBits < 2 * OldBits) {
+ SDValue Clmul = DAG.getNode(ISD::CLMUL, DL, VT, X, Y);
+ unsigned ShAmt = Opcode == ISD::CLMULH ? OldBits : OldBits - 1;
+ SDValue Lo = DAG.getNode(ISD::SRL, DL, VT, Clmul,
+ DAG.getShiftAmountConstant(ShAmt, VT, DL));
+ SDValue Clmulh = DAG.getNode(ISD::CLMULH, DL, VT, X, Y);
+ ShAmt = Opcode == ISD::CLMULH ? NewBits - OldBits : NewBits - OldBits + 1;
+ SDValue Hi = DAG.getNode(ISD::SHL, DL, VT, Clmulh,
+ DAG.getShiftAmountConstant(ShAmt, VT, DL));
+ return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
+ }
+
+ SDValue Clmul = DAG.getNode(ISD::CLMUL, DL, VT, X, Y);
+ unsigned ShAmt = Opcode == ISD::CLMULH ? OldBits : OldBits - 1;
+ return DAG.getNode(ISD::SRL, DL, VT, Clmul,
+ DAG.getShiftAmountConstant(ShAmt, VT, DL));
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Res;
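
The interesting case in PromoteIntRes_CLMUL above is CLMULR/CLMULH when the
promoted width is less than twice the original: the full product's shift by
OldBits-1 (or OldBits) then straddles the promoted-width CLMUL and CLMULH
results, so the two pieces are shifted into place and ORed together. A
standalone check of that stitching for OldBits = 8 promoted to NewBits = 12
(a sketch, not part of the patch; the odd widths are simulated by masking):

    #include <cassert>
    #include <cstdint>

    // Full 15-bit carry-less product of two 8-bit values.
    static uint16_t clmul_full8(uint8_t x, uint8_t y) {
      uint16_t acc = 0;
      for (int i = 0; i < 8; ++i)
        if ((y >> i) & 1)
          acc ^= (uint16_t)x << i;
      return acc;
    }

    int main() {
      const unsigned OldBits = 8, NewBits = 12; // NewBits < 2 * OldBits
      for (unsigned x = 0; x < 256; ++x)
        for (unsigned y = 0; y < 256; ++y) {
          uint16_t P = clmul_full8(x, y);
          uint16_t ClmulNew = P & 0xFFF;     // CLMUL on the promoted type
          uint16_t ClmulhNew = P >> NewBits; // CLMULH on the promoted type
          // Reassemble CLMULR on the original type: P >> (OldBits - 1).
          uint16_t Lo = ClmulNew >> (OldBits - 1);
          uint16_t Hi = ClmulhNew << (NewBits - OldBits + 1);
          assert((uint16_t)(Lo | Hi) == (uint16_t)(P >> (OldBits - 1)));
        }
    }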
@@ -3175,6 +3213,12 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
ExpandIntRes_FunnelShift(N, Lo, Hi);
break;
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
+ ExpandIntRes_CLMUL(N, Lo, Hi);
+ break;
+
case ISD::VSCALE:
ExpandIntRes_VSCALE(N, Lo, Hi);
break;
@@ -5409,6 +5453,31 @@ void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt);
}
+void DAGTypeLegalizer::ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ if (N->getOpcode() != ISD::CLMUL) {
+ SDValue Res = TLI.expandCLMUL(N, DAG);
+ return SplitInteger(Res, Lo, Hi);
+ }
+
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+ EVT HalfVT = LL.getValueType();
+ SDLoc DL(N);
+
+ // The low bits of the result are a direct CLMUL of the low halves.
+ Lo = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RL);
+
+ // We compute the two Hi-Lo cross-products, XOR them, and XOR the result
+ // with the overflow of the CLMUL of the low halves (given by CLMULH of the
+ // low halves) to yield the final high bits.
+ SDValue LoH = DAG.getNode(ISD::CLMULH, DL, HalfVT, LL, RL);
+ SDValue HiLoCross1 = DAG.getNode(ISD::CLMUL, DL, HalfVT, LL, RH);
+ SDValue HiLoCross2 = DAG.getNode(ISD::CLMUL, DL, HalfVT, LH, RL);
+ SDValue HiLoCross = DAG.getNode(ISD::XOR, DL, HalfVT, HiLoCross1, HiLoCross2);
+ Hi = DAG.getNode(ISD::XOR, DL, HalfVT, LoH, HiLoCross);
+}
+
void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,
SDValue &Hi) {
EVT VT = N->getValueType(0);
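
The split in ExpandIntRes_CLMUL follows from carry-less multiplication
distributing over XOR: writing each operand as high:low halves, the LH*RH
term and the upper halves of the two cross products fall entirely above the
result width, leaving exactly the Lo/Hi computed above. A sampled standalone
check at i16 split into i8 halves (a sketch, not part of the patch):

    #include <cassert>
    #include <cstdint>

    static uint32_t clmul_full16(uint16_t x, uint16_t y) {
      uint32_t acc = 0;
      for (int i = 0; i < 16; ++i)
        if ((y >> i) & 1)
          acc ^= (uint32_t)x << i;
      return acc;
    }

    static uint16_t clmul_full8(uint8_t x, uint8_t y) {
      uint16_t acc = 0;
      for (int i = 0; i < 8; ++i)
        if ((y >> i) & 1)
          acc ^= (uint16_t)x << i;
      return acc;
    }

    int main() {
      for (uint32_t x = 0; x <= 0xFFFF; x += 89)   // sampled, not exhaustive
        for (uint32_t y = 0; y <= 0xFFFF; y += 97) {
          uint8_t LL = (uint8_t)x, LH = (uint8_t)(x >> 8);
          uint8_t RL = (uint8_t)y, RH = (uint8_t)(y >> 8);
          uint8_t Lo = (uint8_t)clmul_full8(LL, RL);
          uint8_t Hi = (uint8_t)(clmul_full8(LL, RL) >> 8) // CLMULH(LL, RL)
                       ^ (uint8_t)clmul_full8(LL, RH)      // cross product
                       ^ (uint8_t)clmul_full8(LH, RL);     // cross product
          assert((uint16_t)((Hi << 8) | Lo) == (uint16_t)clmul_full16(x, y));
        }
    }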
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index cd58c8ab1c3e4..71e779f20d20c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -377,6 +377,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
SDValue PromoteIntRes_Rotate(SDNode *N);
SDValue PromoteIntRes_FunnelShift(SDNode *N);
SDValue PromoteIntRes_VPFunnelShift(SDNode *N);
+ SDValue PromoteIntRes_CLMUL(SDNode *N);
SDValue PromoteIntRes_IS_FPCLASS(SDNode *N);
SDValue PromoteIntRes_PATCHPOINT(SDNode *N);
SDValue PromoteIntRes_READ_REGISTER(SDNode *N);
@@ -514,6 +515,7 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
void ExpandIntRes_Rotate (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_FunnelShift (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CLMUL(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_VSCALE (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandIntRes_READ_REGISTER(SDNode *N, SDValue &Lo, SDValue &Hi);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 77353bba4955b..293c5b3a858f7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -1157,6 +1157,14 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
return;
}
break;
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
+ if (SDValue Expanded = TLI.expandCLMUL(Node, DAG)) {
+ Results.push_back(Expanded);
+ return;
+ }
+ break;
case ISD::ROTL:
case ISD::ROTR:
if (SDValue Expanded = TLI.expandROT(Node, false /*AllowVectorOps*/, DAG)) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index af685191d82d8..9f9514684bd63 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -200,6 +200,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SRL:
case ISD::ROTL:
case ISD::ROTR:
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
R = ScalarizeVecRes_BinOp(N);
break;
@@ -1385,6 +1388,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::ADD: case ISD::VP_ADD:
case ISD::SUB: case ISD::VP_SUB:
case ISD::MUL: case ISD::VP_MUL:
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
case ISD::MULHS:
case ISD::MULHU:
case ISD::ABDS:
@@ -4946,6 +4952,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SHL: case ISD::VP_SHL:
case ISD::SRA: case ISD::VP_SRA:
case ISD::SRL: case ISD::VP_SRL:
+ case ISD::CLMUL:
+ case ISD::CLMULR:
+ case ISD::CLMULH:
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
case ISD::VP_FMINNUM:
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index da43ae3b28d70..c683e584af48d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6889,6 +6889,12 @@ static std::optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
return APIntOps::mulhs(C1, C2);
case ISD::MULHU:
return APIntOps::mulhu(C1, C2);
+ case ISD::CLMUL:
+ return APIntOps::clmul(C1, C2);
+ case ISD::CLMULR:
+ return APIntOps::clmulr(C1, C2);
+ case ISD::CLMULH:
+ return APIntOps::clmulh(C1, C2);
}
return std::nullopt;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b9f3dad406e86..6c07ec9964515 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7321,6 +7321,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
return;
}
+ case Intrinsic::clmul: {
+ SDValue X = getValue(I.getArgOperand(0));
+ SDValue Y = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::CLMUL, sdl, X.getValueType(), X, Y));
+ return;
+ }
case Intrinsic::sadd_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 7b24db6cd09d6..9e923997d97de 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -299,6 +299,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::ROTR: return "rotr";
case ISD::FSHL: return "fshl";
case ISD::FSHR: return "fshr";
+ case ISD::CLMUL: return "clmul";
+ case ISD::CLMULR: return "clmulr";
+ case ISD::CLMULH: return "clmulh";
case ISD::FADD: return "fadd";
case ISD::STRICT_FADD: return "strict_fadd";
case ISD::FSUB: return "fsub";
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 69c3455573918..b2f0b099f91b9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -8372,6 +8372,54 @@ SDValue TargetLowering::expandROT(SDNode *Node, bool AllowVectorOps,
return DAG.getNode(ISD::OR, DL, VT, ShVal, HsVal);
}
+SDValue TargetLowering::expandCLMUL(SDNode *Node, SelectionDAG &DAG) const {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue X = Node->getOperand(0);
+ SDValue Y = Node->getOperand(1);
+ unsigned BW = VT.getScalarSizeInBits();
+ unsigned Opcode = Node->getOpcode();
+
+ switch (Opcode) {
+ case ISD::CLMUL: {
+ SDValue Res = DAG.getConstant(0, DL, VT);
+ for (unsigned I = 0; I < BW; ++I) {
+ SDValue Mask = DAG.getConstant(APInt::getOneBitSet(BW, I), DL, VT);
+ SDValue YMasked = DAG.getNode(ISD::AND, DL, VT, Y, Mask);
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, X, YMasked);
+ Res = DAG.getNode(ISD::XOR, DL, VT, Res, Mul);
+ }
+ return Res;
+ }
+ case ISD::CLMULR:
+ case ISD::CLMULH: {
+ EVT ExtVT = VT.changeElementType(
+ *DAG.getContext(), EVT::getIntegerVT(*DAG.getContext(), 2 * BW));
+ // For example, ExtVT = i64 based operations aren't legal on a 32-bit
+ // target; use bitreverse-based lowering in this case.
+ if (!isOperationLegalOrCustom(ISD::ZERO_EXTEND, ExtVT) ||
+ !isOperationLegalOrCustom(ISD::SRL, ExtVT)) {
+ SDValue XRev = DAG.getNode(ISD::BITREVERSE, DL, VT, X);
+ SDValue YRev = DAG.getNode(ISD::BITREVERSE, DL, VT, Y);
+ SDValue ClMul = DAG.getNode(ISD::CLMUL, DL, VT, XRev, YRev);
+ SDValue Res = DAG.getNode(ISD::BITREVERSE, DL, VT, ClMul);
+ if (Opcode == ISD::CLMULH)
+ Res = DAG.getNode(ISD::SRL, DL, VT, Res,
+ DAG.getShiftAmountConstant(1, VT, DL));
+ return Res;
+ }
+ SDValue XExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, X);
+ SDValue YExt = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Y);
+ SDValue ClMul = DAG.getNode(ISD::CLMUL, DL, ExtVT, XExt, YExt);
+ unsigned ShAmt = Opcode == ISD::CLMULR ? BW - 1 : BW;
+ SDValue HiBits = DAG.getNode(ISD::SRL, DL, ExtVT, ClMul,
+ DAG.getShiftAmountConstant(ShAmt, ExtVT, DL));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, HiBits);
+ }
+ }
+ llvm_unreachable("Expected CLMUL, CLMULR, or CLMULH");
+}
+
void TargetLowering::expandShiftParts(SDNode *Node, SDValue &Lo, SDValue &Hi,
SelectionDAG &DAG) const {
assert(Node->getNumOperands() == 3 && "Not a double-shift!");
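
For reference, the generic CLMUL expansion above in plain scalar form: each
bit of Y is isolated with an AND, an ordinary multiply by that one-bit value
acts as a shift of X (or yields zero), and the partial products are
XOR-accumulated. Targets with a native carry-less multiply are expected to
replace this via custom lowering in follow-up patches, per the commit
message. A standalone sketch, not part of the patch:

    #include <cstdint>

    static uint32_t expand_clmul32(uint32_t x, uint32_t y) {
      uint32_t res = 0;
      for (unsigned i = 0; i < 32; ++i) {
        uint32_t y_masked = y & (1u << i); // single bit of y, or zero
        res ^= x * y_masked;               // x << i when the bit is set
      }
      return res;
    }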
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 16a5dcbe040a0..1456592af6b2c 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1117,6 +1117,9 @@ void TargetLoweringBase::initActions() {
    // Absolute difference
setOperationAction({ISD::ABDS, ISD::ABDU}, VT, Expand);
+ // Carry-less multiply
+ setOperationAction({ISD::CLMUL, ISD::CLMULR, ISD::CLMULH}, VT, Expand);
+
// Saturated trunc
setOperationAction(ISD::TRUNCATE_SSAT_S, VT, Expand);
setOperationAction(ISD::TRUNCATE_SSAT_U, VT, Expand);
diff --git a/llvm/test/CodeGen/RISCV/clmul.ll b/llvm/test/CodeGen/RISCV/clmul.ll
new file mode 100644
index 0000000000000..67004d83881b0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/clmul.ll
@@ -0,0 +1,2986 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32IM
+; RUN: llc -mtriple=riscv64 -mattr=+m -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64IM
+
+define i4 @clmul_i4(i4 %a, i4 %b) nounwind {
+; CHECK-LABEL: clmul_i4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: andi a2, a1, 2
+; CHECK-NEXT: andi a3, a1, 1
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: andi a1, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: mul a4, a0, a4
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: xor a0, a4, a0
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: ret
+ %res = call i4 @llvm.clmul.i4(i4 %a, i4 %b)
+ ret i4 %res
+}
+
+define i8 @clmul_i8(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: clmul_i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: zext.b a0, a0
+; CHECK-NEXT: andi a2, a1, 2
+; CHECK-NEXT: andi a3, a1, 1
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: andi a5, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: andi a3, a1, 16
+; CHECK-NEXT: mul a4, a0, a4
+; CHECK-NEXT: mul a5, a0, a5
+; CHECK-NEXT: xor a4, a4, a5
+; CHECK-NEXT: andi a5, a1, 32
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: mul a5, a0, a5
+; CHECK-NEXT: xor a3, a3, a5
+; CHECK-NEXT: xor a2, a2, a4
+; CHECK-NEXT: andi a4, a1, 64
+; CHECK-NEXT: andi a1, a1, 128
+; CHECK-NEXT: mul a4, a0, a4
+; CHECK-NEXT: xor a3, a3, a4
+; CHECK-NEXT: xor a2, a2, a3
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: ret
+ %res = call i8 @llvm.clmul.i8(i8 %a, i8 %b)
+ ret i8 %res
+}
+
+define i16 @clmul_i16(i16 %a, i16 %b) nounwind {
+; RV32IM-LABEL: clmul_i16:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: andi a2, a1, 2
+; RV32IM-NEXT: andi a3, a1, 1
+; RV32IM-NEXT: andi a4, a1, 4
+; RV32IM-NEXT: andi a5, a1, 8
+; RV32IM-NEXT: andi a6, a1, 16
+; RV32IM-NEXT: andi a7, a1, 32
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: andi a3, a1, 64
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: andi a5, a1, 128
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a6, a6, a7
+; RV32IM-NEXT: andi a7, a1, 256
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: andi a7, a1, 512
+; RV32IM-NEXT: xor a2, a2, a4
+; RV32IM-NEXT: li a4, 1
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a3, a6, a3
+; RV32IM-NEXT: lui a6, 1
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: lui a7, 2
+; RV32IM-NEXT: slli a4, a4, 11
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a4, a1, a4
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a4, a4, a6
+; RV32IM-NEXT: lui a6, 4
+; RV32IM-NEXT: xor a2, a2, a3
+; RV32IM-NEXT: lui a3, 8
+; RV32IM-NEXT: and a7, a1, a7
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a3, a1, a3
+; RV32IM-NEXT: andi a1, a1, 1024
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: xor a1, a5, a1
+; RV32IM-NEXT: mul a5, a0, a7
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a1, a2, a1
+; RV32IM-NEXT: mul a2, a0, a6
+; RV32IM-NEXT: xor a2, a4, a2
+; RV32IM-NEXT: xor a1, a1, a2
+; RV32IM-NEXT: mul a0, a0, a3
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmul_i16:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: andi a2, a1, 2
+; RV64IM-NEXT: andi a3, a1, 1
+; RV64IM-NEXT: andi a4, a1, 4
+; RV64IM-NEXT: andi a5, a1, 8
+; RV64IM-NEXT: andi a6, a1, 16
+; RV64IM-NEXT: andi a7, a1, 32
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: andi a3, a1, 64
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: andi a5, a1, 128
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a6, a6, a7
+; RV64IM-NEXT: andi a7, a1, 256
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: andi a7, a1, 512
+; RV64IM-NEXT: xor a2, a2, a4
+; RV64IM-NEXT: li a4, 1
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: lui a6, 1
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: lui a7, 2
+; RV64IM-NEXT: slli a4, a4, 11
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a4, a1, a4
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a4, a4, a6
+; RV64IM-NEXT: lui a6, 4
+; RV64IM-NEXT: xor a2, a2, a3
+; RV64IM-NEXT: lui a3, 8
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a3, a1, a3
+; RV64IM-NEXT: andi a1, a1, 1024
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: xor a1, a5, a1
+; RV64IM-NEXT: mul a5, a0, a7
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: xor a1, a2, a1
+; RV64IM-NEXT: mul a2, a0, a6
+; RV64IM-NEXT: xor a2, a4, a2
+; RV64IM-NEXT: xor a1, a1, a2
+; RV64IM-NEXT: mul a0, a0, a3
+; RV64IM-NEXT: xor a0, a1, a0
+; RV64IM-NEXT: ret
+ %res = call i16 @llvm.clmul.i16(i16 %a, i16 %b)
+ ret i16 %res
+}
+
+define i32 @clmul_i32(i32 %a, i32 %b) nounwind {
+; RV32IM-LABEL: clmul_i32:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: addi sp, sp, -48
+; RV32IM-NEXT: sw s0, 44(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s1, 40(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 36(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s3, 32(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s4, 28(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s5, 24(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s6, 20(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s7, 16(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s8, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s9, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s10, 4(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi t6, a1, 2
+; RV32IM-NEXT: andi s1, a1, 1
+; RV32IM-NEXT: andi a7, a1, 4
+; RV32IM-NEXT: andi t2, a1, 8
+; RV32IM-NEXT: andi t0, a1, 16
+; RV32IM-NEXT: andi t3, a1, 32
+; RV32IM-NEXT: andi a2, a1, 64
+; RV32IM-NEXT: andi t4, a1, 128
+; RV32IM-NEXT: andi s0, a1, 256
+; RV32IM-NEXT: andi a3, a1, 512
+; RV32IM-NEXT: li a4, 1
+; RV32IM-NEXT: lui a5, 1
+; RV32IM-NEXT: lui a6, 2
+; RV32IM-NEXT: lui t1, 4
+; RV32IM-NEXT: lui t5, 8
+; RV32IM-NEXT: lui s2, 16
+; RV32IM-NEXT: lui s3, 32
+; RV32IM-NEXT: lui s4, 64
+; RV32IM-NEXT: lui s5, 128
+; RV32IM-NEXT: lui s6, 256
+; RV32IM-NEXT: lui s7, 512
+; RV32IM-NEXT: lui s8, 1024
+; RV32IM-NEXT: lui s9, 2048
+; RV32IM-NEXT: lui s10, 4096
+; RV32IM-NEXT: mul t6, a0, t6
+; RV32IM-NEXT: mul s1, a0, s1
+; RV32IM-NEXT: xor t6, s1, t6
+; RV32IM-NEXT: lui s1, 8192
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: mul t2, a0, t2
+; RV32IM-NEXT: xor a7, a7, t2
+; RV32IM-NEXT: lui t2, 16384
+; RV32IM-NEXT: mul t0, a0, t0
+; RV32IM-NEXT: mul t3, a0, t3
+; RV32IM-NEXT: xor t0, t0, t3
+; RV32IM-NEXT: lui t3, 32768
+; RV32IM-NEXT: mul t4, a0, t4
+; RV32IM-NEXT: mul s0, a0, s0
+; RV32IM-NEXT: xor t4, t4, s0
+; RV32IM-NEXT: lui s0, 65536
+; RV32IM-NEXT: xor a7, t6, a7
+; RV32IM-NEXT: lui t6, 131072
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: xor a2, t0, a2
+; RV32IM-NEXT: lui t0, 262144
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a3, t4, a3
+; RV32IM-NEXT: lui t4, 524288
+; RV32IM-NEXT: slli a4, a4, 11
+; RV32IM-NEXT: and a5, a1, a5
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and t1, a1, t1
+; RV32IM-NEXT: and t5, a1, t5
+; RV32IM-NEXT: and s2, a1, s2
+; RV32IM-NEXT: and s3, a1, s3
+; RV32IM-NEXT: and s4, a1, s4
+; RV32IM-NEXT: and s5, a1, s5
+; RV32IM-NEXT: and s6, a1, s6
+; RV32IM-NEXT: and s7, a1, s7
+; RV32IM-NEXT: and s8, a1, s8
+; RV32IM-NEXT: and s9, a1, s9
+; RV32IM-NEXT: and s10, a1, s10
+; RV32IM-NEXT: and s1, a1, s1
+; RV32IM-NEXT: and t2, a1, t2
+; RV32IM-NEXT: and t3, a1, t3
+; RV32IM-NEXT: and s0, a1, s0
+; RV32IM-NEXT: and t6, a1, t6
+; RV32IM-NEXT: and t0, a1, t0
+; RV32IM-NEXT: and t4, a1, t4
+; RV32IM-NEXT: and a4, a1, a4
+; RV32IM-NEXT: andi a1, a1, 1024
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul t1, a0, t1
+; RV32IM-NEXT: mul t5, a0, t5
+; RV32IM-NEXT: mul s2, a0, s2
+; RV32IM-NEXT: mul s3, a0, s3
+; RV32IM-NEXT: mul s4, a0, s4
+; RV32IM-NEXT: mul s5, a0, s5
+; RV32IM-NEXT: mul s6, a0, s6
+; RV32IM-NEXT: mul s7, a0, s7
+; RV32IM-NEXT: mul s8, a0, s8
+; RV32IM-NEXT: mul s9, a0, s9
+; RV32IM-NEXT: mul s10, a0, s10
+; RV32IM-NEXT: mul s1, a0, s1
+; RV32IM-NEXT: mul t2, a0, t2
+; RV32IM-NEXT: mul t3, a0, t3
+; RV32IM-NEXT: mul s0, a0, s0
+; RV32IM-NEXT: mul t6, a0, t6
+; RV32IM-NEXT: mul t0, a0, t0
+; RV32IM-NEXT: mul t4, a0, t4
+; RV32IM-NEXT: mul a0, a0, a4
+; RV32IM-NEXT: xor a4, t1, t5
+; RV32IM-NEXT: xor t1, s5, s6
+; RV32IM-NEXT: xor t2, s1, t2
+; RV32IM-NEXT: xor a2, a7, a2
+; RV32IM-NEXT: xor a1, a3, a1
+; RV32IM-NEXT: xor a0, a0, a5
+; RV32IM-NEXT: xor a3, a4, s2
+; RV32IM-NEXT: xor a4, t1, s7
+; RV32IM-NEXT: xor a5, t2, t3
+; RV32IM-NEXT: xor a1, a2, a1
+; RV32IM-NEXT: xor a0, a0, a6
+; RV32IM-NEXT: xor a2, a3, s3
+; RV32IM-NEXT: xor a3, a4, s8
+; RV32IM-NEXT: xor a5, a5, s0
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: xor a1, a2, s4
+; RV32IM-NEXT: xor a2, a3, s9
+; RV32IM-NEXT: xor a3, a5, t6
+; RV32IM-NEXT: xor a0, a0, a1
+; RV32IM-NEXT: xor a1, a2, s10
+; RV32IM-NEXT: xor a2, a3, t0
+; RV32IM-NEXT: xor a0, a0, a1
+; RV32IM-NEXT: xor a1, a2, t4
+; RV32IM-NEXT: xor a0, a0, a1
+; RV32IM-NEXT: lw s0, 44(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s1, 40(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s2, 36(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s3, 32(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s4, 28(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s5, 24(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s6, 20(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s7, 16(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s8, 12(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s9, 8(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s10, 4(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: addi sp, sp, 48
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmul_i32:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -128
+; RV64IM-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s4, 80(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s5, 72(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s6, 64(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s7, 56(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s8, 48(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s9, 40(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s10, 32(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s11, 24(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a6, a0, 32
+; RV64IM-NEXT: andi t1, a1, 2
+; RV64IM-NEXT: andi t3, a1, 1
+; RV64IM-NEXT: andi a5, a1, 4
+; RV64IM-NEXT: andi a7, a1, 8
+; RV64IM-NEXT: andi a3, a1, 16
+; RV64IM-NEXT: andi a4, a1, 32
+; RV64IM-NEXT: andi a0, a1, 64
+; RV64IM-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: andi t0, a1, 128
+; RV64IM-NEXT: andi t2, a1, 256
+; RV64IM-NEXT: andi a0, a1, 512
+; RV64IM-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: li a2, 1
+; RV64IM-NEXT: lui t5, 1
+; RV64IM-NEXT: lui t6, 2
+; RV64IM-NEXT: lui s0, 4
+; RV64IM-NEXT: lui s2, 8
+; RV64IM-NEXT: lui s3, 16
+; RV64IM-NEXT: lui s4, 32
+; RV64IM-NEXT: lui s5, 64
+; RV64IM-NEXT: lui s6, 128
+; RV64IM-NEXT: lui s7, 256
+; RV64IM-NEXT: lui s8, 512
+; RV64IM-NEXT: lui s9, 1024
+; RV64IM-NEXT: lui s10, 2048
+; RV64IM-NEXT: lui s11, 4096
+; RV64IM-NEXT: lui ra, 8192
+; RV64IM-NEXT: lui a0, 16384
+; RV64IM-NEXT: srli s1, a6, 32
+; RV64IM-NEXT: mul a6, s1, t1
+; RV64IM-NEXT: mul t1, s1, t3
+; RV64IM-NEXT: xor a6, t1, a6
+; RV64IM-NEXT: sd a6, 0(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: lui t1, 32768
+; RV64IM-NEXT: mul a5, s1, a5
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: xor t4, a5, a7
+; RV64IM-NEXT: lui a7, 65536
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: xor a6, a3, a4
+; RV64IM-NEXT: lui t3, 131072
+; RV64IM-NEXT: mul a4, s1, t0
+; RV64IM-NEXT: mul t0, s1, t2
+; RV64IM-NEXT: xor a5, a4, t0
+; RV64IM-NEXT: lui t0, 262144
+; RV64IM-NEXT: slli t2, a2, 11
+; RV64IM-NEXT: and t5, a1, t5
+; RV64IM-NEXT: and t6, a1, t6
+; RV64IM-NEXT: and s0, a1, s0
+; RV64IM-NEXT: and s2, a1, s2
+; RV64IM-NEXT: and s3, a1, s3
+; RV64IM-NEXT: and s4, a1, s4
+; RV64IM-NEXT: and s5, a1, s5
+; RV64IM-NEXT: and s6, a1, s6
+; RV64IM-NEXT: and s7, a1, s7
+; RV64IM-NEXT: and s8, a1, s8
+; RV64IM-NEXT: and s9, a1, s9
+; RV64IM-NEXT: and s10, a1, s10
+; RV64IM-NEXT: and s11, a1, s11
+; RV64IM-NEXT: and ra, a1, ra
+; RV64IM-NEXT: and a2, a1, a0
+; RV64IM-NEXT: and t1, a1, t1
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and t3, a1, t3
+; RV64IM-NEXT: and t0, a1, t0
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: andi a0, a1, 1024
+; RV64IM-NEXT: srliw a1, a1, 31
+; RV64IM-NEXT: slli a1, a1, 31
+; RV64IM-NEXT: ld a3, 16(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: ld a4, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: mul a0, s1, a0
+; RV64IM-NEXT: mul t5, s1, t5
+; RV64IM-NEXT: mul t6, s1, t6
+; RV64IM-NEXT: mul s0, s1, s0
+; RV64IM-NEXT: mul s2, s1, s2
+; RV64IM-NEXT: mul s3, s1, s3
+; RV64IM-NEXT: mul s4, s1, s4
+; RV64IM-NEXT: mul s5, s1, s5
+; RV64IM-NEXT: mul s6, s1, s6
+; RV64IM-NEXT: mul s7, s1, s7
+; RV64IM-NEXT: mul s8, s1, s8
+; RV64IM-NEXT: mul s9, s1, s9
+; RV64IM-NEXT: mul s10, s1, s10
+; RV64IM-NEXT: mul s11, s1, s11
+; RV64IM-NEXT: mul ra, s1, ra
+; RV64IM-NEXT: mul a2, s1, a2
+; RV64IM-NEXT: mul t1, s1, t1
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: mul t3, s1, t3
+; RV64IM-NEXT: mul t0, s1, t0
+; RV64IM-NEXT: mul a1, s1, a1
+; RV64IM-NEXT: mul t2, s1, t2
+; RV64IM-NEXT: xor s1, s2, s3
+; RV64IM-NEXT: xor s2, s8, s9
+; RV64IM-NEXT: xor a7, a7, t3
+; RV64IM-NEXT: ld t3, 0(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor t3, t3, t4
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: xor a4, a5, a4
+; RV64IM-NEXT: xor a5, t2, t5
+; RV64IM-NEXT: xor a6, s1, s4
+; RV64IM-NEXT: xor t2, s2, s10
+; RV64IM-NEXT: xor a7, a7, t0
+; RV64IM-NEXT: xor a3, t3, a3
+; RV64IM-NEXT: xor a0, a4, a0
+; RV64IM-NEXT: xor a4, a5, t6
+; RV64IM-NEXT: xor a5, a6, s5
+; RV64IM-NEXT: xor a6, t2, s11
+; RV64IM-NEXT: xor a0, a3, a0
+; RV64IM-NEXT: xor a4, a4, s0
+; RV64IM-NEXT: xor a3, a5, s6
+; RV64IM-NEXT: xor a5, a6, ra
+; RV64IM-NEXT: xor a0, a0, a4
+; RV64IM-NEXT: xor a3, a3, s7
+; RV64IM-NEXT: xor a2, a5, a2
+; RV64IM-NEXT: xor a0, a0, a3
+; RV64IM-NEXT: xor a2, a2, t1
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: xor a1, a7, a1
+; RV64IM-NEXT: xor a0, a0, a1
+; RV64IM-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s3, 88(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s4, 80(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s5, 72(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s6, 64(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s7, 56(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s8, 48(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 128
+; RV64IM-NEXT: ret
+ %res = call i32 @llvm.clmul.i32(i32 %a, i32 %b)
+ ret i32 %res
+}
+
+define i64 @clmul_i64(i64 %a, i64 %b) nounwind {
+; RV32IM-LABEL: clmul_i64:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: addi sp, sp, -272
+; RV32IM-NEXT: sw ra, 268(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s0, 264(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s1, 260(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 256(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s3, 252(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s4, 248(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s5, 244(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s6, 240(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s7, 236(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s8, 232(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s9, 228(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s10, 224(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s11, 220(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mv t1, a1
+; RV32IM-NEXT: srli a7, a0, 8
+; RV32IM-NEXT: lui s11, 16
+; RV32IM-NEXT: srli t0, a0, 24
+; RV32IM-NEXT: srli a1, a2, 8
+; RV32IM-NEXT: srli t2, a2, 24
+; RV32IM-NEXT: andi t3, a2, 2
+; RV32IM-NEXT: andi t5, a2, 1
+; RV32IM-NEXT: andi t6, a2, 4
+; RV32IM-NEXT: andi s0, a2, 8
+; RV32IM-NEXT: andi s1, a2, 16
+; RV32IM-NEXT: andi s2, a2, 32
+; RV32IM-NEXT: andi t4, a2, 128
+; RV32IM-NEXT: andi s4, a2, 256
+; RV32IM-NEXT: andi a4, a3, 2
+; RV32IM-NEXT: andi a5, a3, 1
+; RV32IM-NEXT: andi s7, a3, 4
+; RV32IM-NEXT: andi s8, a3, 8
+; RV32IM-NEXT: mul a6, t1, t3
+; RV32IM-NEXT: mul s3, t1, t5
+; RV32IM-NEXT: mul s5, t1, t6
+; RV32IM-NEXT: mul s6, t1, s0
+; RV32IM-NEXT: mul s9, t1, s1
+; RV32IM-NEXT: xor a6, s3, a6
+; RV32IM-NEXT: mul s3, t1, s2
+; RV32IM-NEXT: xor s5, s5, s6
+; RV32IM-NEXT: mul s6, t1, t4
+; RV32IM-NEXT: xor s3, s9, s3
+; RV32IM-NEXT: mul s9, t1, s4
+; RV32IM-NEXT: xor s6, s6, s9
+; RV32IM-NEXT: andi s9, a3, 16
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a5, a4
+; RV32IM-NEXT: andi s10, a3, 32
+; RV32IM-NEXT: mul a5, a0, s7
+; RV32IM-NEXT: mul s7, a0, s8
+; RV32IM-NEXT: xor a5, a5, s7
+; RV32IM-NEXT: andi s8, a3, 128
+; RV32IM-NEXT: mul s7, a0, s9
+; RV32IM-NEXT: mul s9, a0, s10
+; RV32IM-NEXT: xor s7, s7, s9
+; RV32IM-NEXT: andi s9, a3, 256
+; RV32IM-NEXT: mul s8, a0, s8
+; RV32IM-NEXT: mul s9, a0, s9
+; RV32IM-NEXT: xor s8, s8, s9
+; RV32IM-NEXT: mul t3, a0, t3
+; RV32IM-NEXT: mul t5, a0, t5
+; RV32IM-NEXT: xor t5, t5, t3
+; RV32IM-NEXT: andi t3, a2, 64
+; RV32IM-NEXT: mul t6, a0, t6
+; RV32IM-NEXT: mul s0, a0, s0
+; RV32IM-NEXT: xor s0, t6, s0
+; RV32IM-NEXT: andi t6, a2, 512
+; RV32IM-NEXT: mul s1, a0, s1
+; RV32IM-NEXT: mul s2, a0, s2
+; RV32IM-NEXT: xor s1, s1, s2
+; RV32IM-NEXT: addi s9, s11, -256
+; RV32IM-NEXT: sw s9, 216(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul t4, a0, t4
+; RV32IM-NEXT: mul s2, a0, s4
+; RV32IM-NEXT: xor t4, t4, s2
+; RV32IM-NEXT: mul s2, t1, t3
+; RV32IM-NEXT: and a7, a7, s9
+; RV32IM-NEXT: or a7, a7, t0
+; RV32IM-NEXT: sw a7, 212(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a7, t1, t6
+; RV32IM-NEXT: and t0, a1, s9
+; RV32IM-NEXT: or a1, t0, t2
+; RV32IM-NEXT: sw a1, 208(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui s9, 4
+; RV32IM-NEXT: and t2, a2, s9
+; RV32IM-NEXT: xor a1, a6, s5
+; RV32IM-NEXT: sw a1, 204(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui s5, 8
+; RV32IM-NEXT: and s4, a2, s5
+; RV32IM-NEXT: xor a1, s3, s2
+; RV32IM-NEXT: sw a1, 200(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a6, t1, t2
+; RV32IM-NEXT: xor a1, s6, a7
+; RV32IM-NEXT: sw a1, 196(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a7, t1, s4
+; RV32IM-NEXT: xor a1, a6, a7
+; RV32IM-NEXT: sw a1, 192(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a1, 256
+; RV32IM-NEXT: lui s6, 128
+; RV32IM-NEXT: and t0, a2, s6
+; RV32IM-NEXT: and a1, a2, a1
+; RV32IM-NEXT: lui s10, 256
+; RV32IM-NEXT: mul a6, t1, t0
+; RV32IM-NEXT: mul a7, t1, a1
+; RV32IM-NEXT: xor a6, a6, a7
+; RV32IM-NEXT: sw a6, 188(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a6, 8192
+; RV32IM-NEXT: lui a7, 16384
+; RV32IM-NEXT: and a6, a2, a6
+; RV32IM-NEXT: lui s11, 8192
+; RV32IM-NEXT: and a7, a2, a7
+; RV32IM-NEXT: lui ra, 16384
+; RV32IM-NEXT: mul s2, t1, a6
+; RV32IM-NEXT: mul s3, t1, a7
+; RV32IM-NEXT: xor s2, s2, s3
+; RV32IM-NEXT: sw s2, 180(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 184(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi a4, a3, 64
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a4, s7, a4
+; RV32IM-NEXT: sw a4, 176(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi a4, a3, 512
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a4, s8, a4
+; RV32IM-NEXT: sw a4, 172(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a3, s9
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: and a5, a3, s5
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 168(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a3, s6
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: and a5, a3, s10
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 164(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a3, s11
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: and a5, a3, ra
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 156(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: xor a4, t5, s0
+; RV32IM-NEXT: sw a4, 160(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a4, a0, t3
+; RV32IM-NEXT: xor a4, s1, a4
+; RV32IM-NEXT: sw a4, 152(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a4, a0, t6
+; RV32IM-NEXT: xor a4, t4, a4
+; RV32IM-NEXT: sw a4, 148(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a4, a0, t2
+; RV32IM-NEXT: mul a5, a0, s4
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 144(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a4, a0, t0
+; RV32IM-NEXT: mul a5, a0, a1
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 140(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a4, a0, a6
+; RV32IM-NEXT: mul a5, a0, a7
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: sw a4, 132(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: li a4, 1
+; RV32IM-NEXT: slli s6, a4, 11
+; RV32IM-NEXT: andi a4, a3, 1024
+; RV32IM-NEXT: mul a1, a0, a4
+; RV32IM-NEXT: sw a1, 116(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi a4, a2, 1024
+; RV32IM-NEXT: mul a1, t1, a4
+; RV32IM-NEXT: sw a1, 104(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a4
+; RV32IM-NEXT: sw a1, 124(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a4, 1
+; RV32IM-NEXT: lui a5, 2
+; RV32IM-NEXT: lui a1, 32
+; RV32IM-NEXT: lui s3, 64
+; RV32IM-NEXT: lui t2, 512
+; RV32IM-NEXT: lui t3, 1024
+; RV32IM-NEXT: lui t4, 2048
+; RV32IM-NEXT: lui t5, 4096
+; RV32IM-NEXT: lui t6, 32768
+; RV32IM-NEXT: lui s0, 65536
+; RV32IM-NEXT: lui s1, 131072
+; RV32IM-NEXT: lui s2, 262144
+; RV32IM-NEXT: lui s4, 524288
+; RV32IM-NEXT: and a4, a3, a4
+; RV32IM-NEXT: lui s7, 1
+; RV32IM-NEXT: and a5, a3, a5
+; RV32IM-NEXT: lui s9, 2
+; RV32IM-NEXT: lui t0, 16
+; RV32IM-NEXT: and a6, a3, t0
+; RV32IM-NEXT: and a7, a3, a1
+; RV32IM-NEXT: lui s8, 32
+; RV32IM-NEXT: and a1, a3, s3
+; RV32IM-NEXT: lui s10, 64
+; RV32IM-NEXT: and t2, a3, t2
+; RV32IM-NEXT: lui s11, 512
+; RV32IM-NEXT: and t3, a3, t3
+; RV32IM-NEXT: lui ra, 1024
+; RV32IM-NEXT: and t4, a3, t4
+; RV32IM-NEXT: and t5, a3, t5
+; RV32IM-NEXT: and t6, a3, t6
+; RV32IM-NEXT: and s0, a3, s0
+; RV32IM-NEXT: and s1, a3, s1
+; RV32IM-NEXT: and s3, a3, s2
+; RV32IM-NEXT: and s5, a3, s4
+; RV32IM-NEXT: and s2, a3, s6
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a3, a0, a5
+; RV32IM-NEXT: sw a3, 96(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a3, a0, a6
+; RV32IM-NEXT: sw a3, 48(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a3, a0, a7
+; RV32IM-NEXT: sw a3, 92(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: sw a1, 112(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, t2
+; RV32IM-NEXT: sw a1, 40(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, t3
+; RV32IM-NEXT: sw a1, 88(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, t4
+; RV32IM-NEXT: sw a1, 108(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, t5
+; RV32IM-NEXT: sw a1, 128(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, t6
+; RV32IM-NEXT: sw a1, 32(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, s0
+; RV32IM-NEXT: sw a1, 84(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, s1
+; RV32IM-NEXT: sw a1, 100(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, s3
+; RV32IM-NEXT: sw a1, 120(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, s5
+; RV32IM-NEXT: sw a1, 136(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a2, s7
+; RV32IM-NEXT: mul t5, t1, a3
+; RV32IM-NEXT: mul a1, a0, a3
+; RV32IM-NEXT: sw a1, 16(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a2, s9
+; RV32IM-NEXT: mul a1, t1, a3
+; RV32IM-NEXT: sw a1, 36(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a3
+; RV32IM-NEXT: sw a1, 64(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a2, t0
+; RV32IM-NEXT: mul t3, t1, a3
+; RV32IM-NEXT: mul s9, a0, a3
+; RV32IM-NEXT: and a3, a2, s8
+; RV32IM-NEXT: mul a1, t1, a3
+; RV32IM-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a3
+; RV32IM-NEXT: sw a1, 44(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a2, s10
+; RV32IM-NEXT: mul a1, t1, a3
+; RV32IM-NEXT: sw a1, 60(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a3
+; RV32IM-NEXT: sw a1, 80(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a2, s11
+; RV32IM-NEXT: mul a7, t1, a3
+; RV32IM-NEXT: mul s4, a0, a3
+; RV32IM-NEXT: and a6, a2, ra
+; RV32IM-NEXT: mul s11, t1, a6
+; RV32IM-NEXT: mul a1, a0, a6
+; RV32IM-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a1, 2048
+; RV32IM-NEXT: and a6, a2, a1
+; RV32IM-NEXT: mul a1, t1, a6
+; RV32IM-NEXT: sw a1, 28(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a6
+; RV32IM-NEXT: sw a1, 52(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a1, 4096
+; RV32IM-NEXT: and a6, a2, a1
+; RV32IM-NEXT: mul a1, t1, a6
+; RV32IM-NEXT: sw a1, 56(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a6
+; RV32IM-NEXT: sw a1, 76(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a1, 32768
+; RV32IM-NEXT: and a6, a2, a1
+; RV32IM-NEXT: mul a5, t1, a6
+; RV32IM-NEXT: mul s1, a0, a6
+; RV32IM-NEXT: lui a1, 65536
+; RV32IM-NEXT: and a6, a2, a1
+; RV32IM-NEXT: mul s7, t1, a6
+; RV32IM-NEXT: mul s8, a0, a6
+; RV32IM-NEXT: lui a1, 131072
+; RV32IM-NEXT: and t6, a2, a1
+; RV32IM-NEXT: mul s10, t1, t6
+; RV32IM-NEXT: mul ra, a0, t6
+; RV32IM-NEXT: lui a1, 262144
+; RV32IM-NEXT: and s5, a2, a1
+; RV32IM-NEXT: mul a1, t1, s5
+; RV32IM-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul s5, a0, s5
+; RV32IM-NEXT: lui a1, 524288
+; RV32IM-NEXT: and a1, a2, a1
+; RV32IM-NEXT: mul a3, t1, a1
+; RV32IM-NEXT: sw a3, 68(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: sw a1, 72(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a1, a2, s6
+; RV32IM-NEXT: mul t1, t1, a1
+; RV32IM-NEXT: mul s2, a0, s2
+; RV32IM-NEXT: mul t6, a0, a1
+; RV32IM-NEXT: slli a1, a0, 24
+; RV32IM-NEXT: lw a3, 216(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: and a0, a0, a3
+; RV32IM-NEXT: slli a0, a0, 8
+; RV32IM-NEXT: or s3, a1, a0
+; RV32IM-NEXT: slli a1, a2, 24
+; RV32IM-NEXT: and a2, a2, a3
+; RV32IM-NEXT: slli a2, a2, 8
+; RV32IM-NEXT: or t2, a1, a2
+; RV32IM-NEXT: lw a0, 204(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a1, 200(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t4, a0, a1
+; RV32IM-NEXT: lw a0, 196(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a1, 104(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a6, a0, a1
+; RV32IM-NEXT: xor t1, t1, t5
+; RV32IM-NEXT: lw a0, 192(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t3, a0, t3
+; RV32IM-NEXT: lw a0, 188(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a7, a0, a7
+; RV32IM-NEXT: lw a0, 180(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t0, a0, a5
+; RV32IM-NEXT: lw a0, 184(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a1, 176(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t5, a0, a1
+; RV32IM-NEXT: lw a0, 172(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a5, 116(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a5, a0, a5
+; RV32IM-NEXT: xor s2, s2, a4
+; RV32IM-NEXT: lw a0, 168(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a1, 48(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a0, a1
+; RV32IM-NEXT: lw a0, 164(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a2, 40(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a2, a0, a2
+; RV32IM-NEXT: lw a0, 156(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a3, 32(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a0, a3
+; RV32IM-NEXT: lw a0, 160(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a4, 152(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, a0, a4
+; RV32IM-NEXT: lw a0, 148(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s0, 124(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a0, a0, s0
+; RV32IM-NEXT: lw s0, 16(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t6, t6, s0
+; RV32IM-NEXT: lw s0, 144(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor s9, s0, s9
+; RV32IM-NEXT: lw s0, 140(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor s4, s0, s4
+; RV32IM-NEXT: lw s0, 132(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor s1, s0, s1
+; RV32IM-NEXT: lw s0, 212(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: or s0, s3, s0
+; RV32IM-NEXT: lw s3, 208(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: or t2, t2, s3
+; RV32IM-NEXT: xor a6, t4, a6
+; RV32IM-NEXT: lw t4, 36(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t1, t1, t4
+; RV32IM-NEXT: lw t4, 20(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t3, t3, t4
+; RV32IM-NEXT: xor a7, a7, s11
+; RV32IM-NEXT: xor t0, t0, s7
+; RV32IM-NEXT: xor a5, t5, a5
+; RV32IM-NEXT: lw t4, 96(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t4, s2, t4
+; RV32IM-NEXT: lw t5, 92(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a1, t5
+; RV32IM-NEXT: lw t5, 88(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a2, a2, t5
+; RV32IM-NEXT: lw t5, 84(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, t5
+; RV32IM-NEXT: xor a0, a4, a0
+; RV32IM-NEXT: lw a4, 64(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, t6, a4
+; RV32IM-NEXT: lw t5, 44(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t5, s9, t5
+; RV32IM-NEXT: lw t6, 24(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t6, s4, t6
+; RV32IM-NEXT: xor s1, s1, s8
+; RV32IM-NEXT: xor a6, a6, t1
+; RV32IM-NEXT: lw t1, 60(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t1, t3, t1
+; RV32IM-NEXT: lw t3, 28(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a7, a7, t3
+; RV32IM-NEXT: xor t0, t0, s10
+; RV32IM-NEXT: xor a5, a5, t4
+; RV32IM-NEXT: lw t3, 112(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a1, t3
+; RV32IM-NEXT: lw t3, 108(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a2, a2, t3
+; RV32IM-NEXT: lw t3, 100(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, t3
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: lw a4, 80(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, t5, a4
+; RV32IM-NEXT: lw t3, 52(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t3, t6, t3
+; RV32IM-NEXT: xor t4, s1, ra
+; RV32IM-NEXT: xor a6, a6, t1
+; RV32IM-NEXT: lw t1, 56(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a7, a7, t1
+; RV32IM-NEXT: lw t1, 12(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t0, t0, t1
+; RV32IM-NEXT: xor a5, a5, a1
+; RV32IM-NEXT: lw a1, 128(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a2, a2, a1
+; RV32IM-NEXT: lw a1, 120(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, a1
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: lw a1, 76(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, t3, a1
+; RV32IM-NEXT: xor t1, t4, s5
+; RV32IM-NEXT: lui a1, 61681
+; RV32IM-NEXT: addi t5, a1, -241
+; RV32IM-NEXT: srli t3, s0, 4
+; RV32IM-NEXT: and s0, s0, t5
+; RV32IM-NEXT: and t3, t3, t5
+; RV32IM-NEXT: slli s0, s0, 4
+; RV32IM-NEXT: or t3, t3, s0
+; RV32IM-NEXT: srli t4, t2, 4
+; RV32IM-NEXT: and t2, t2, t5
+; RV32IM-NEXT: and t4, t4, t5
+; RV32IM-NEXT: slli t2, t2, 4
+; RV32IM-NEXT: or t2, t4, t2
+; RV32IM-NEXT: xor a6, a6, a7
+; RV32IM-NEXT: lw a1, 68(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a7, t0, a1
+; RV32IM-NEXT: xor a5, a5, a2
+; RV32IM-NEXT: lw a1, 136(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, a1
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: sw a0, 212(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 72(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a0, t1, a0
+; RV32IM-NEXT: sw a0, 208(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: xor a4, a6, a7
+; RV32IM-NEXT: xor a3, a5, a3
+; RV32IM-NEXT: xor a3, a3, a4
+; RV32IM-NEXT: sw a3, 204(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a4, 209715
+; RV32IM-NEXT: addi t4, a4, 819
+; RV32IM-NEXT: srli a5, t3, 2
+; RV32IM-NEXT: and a6, t3, t4
+; RV32IM-NEXT: and a5, a5, t4
+; RV32IM-NEXT: slli a6, a6, 2
+; RV32IM-NEXT: or a7, a5, a6
+; RV32IM-NEXT: srli a5, t2, 2
+; RV32IM-NEXT: and a6, t2, t4
+; RV32IM-NEXT: and a5, a5, t4
+; RV32IM-NEXT: slli a6, a6, 2
+; RV32IM-NEXT: or t0, a5, a6
+; RV32IM-NEXT: lui t6, 349525
+; RV32IM-NEXT: addi t6, t6, 1365
+; RV32IM-NEXT: srli t1, a7, 1
+; RV32IM-NEXT: and a7, a7, t6
+; RV32IM-NEXT: and t1, t1, t6
+; RV32IM-NEXT: slli a7, a7, 1
+; RV32IM-NEXT: or a7, t1, a7
+; RV32IM-NEXT: srli t1, t0, 1
+; RV32IM-NEXT: and t0, t0, t6
+; RV32IM-NEXT: and t1, t1, t6
+; RV32IM-NEXT: slli t0, t0, 1
+; RV32IM-NEXT: or t0, t1, t0
+; RV32IM-NEXT: and s0, t0, s6
+; RV32IM-NEXT: lui a0, 1
+; RV32IM-NEXT: and s2, t0, a0
+; RV32IM-NEXT: lui a0, 2
+; RV32IM-NEXT: and s3, t0, a0
+; RV32IM-NEXT: lui a0, 4
+; RV32IM-NEXT: and s5, t0, a0
+; RV32IM-NEXT: lui a0, 8
+; RV32IM-NEXT: and s6, t0, a0
+; RV32IM-NEXT: lui a0, 16
+; RV32IM-NEXT: and s10, t0, a0
+; RV32IM-NEXT: lui a0, 32
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 196(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 64
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 192(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 128
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 188(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 256
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 184(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 512
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 180(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 1024
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 176(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 2048
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 172(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 4096
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 168(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 8192
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 164(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 16384
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 160(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 32768
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 156(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 65536
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 152(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 131072
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 148(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 262144
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 144(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui a0, 524288
+; RV32IM-NEXT: and a0, t0, a0
+; RV32IM-NEXT: sw a0, 140(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi a0, t0, 2
+; RV32IM-NEXT: andi a1, t0, 1
+; RV32IM-NEXT: andi a2, t0, 4
+; RV32IM-NEXT: andi a3, t0, 8
+; RV32IM-NEXT: andi a4, t0, 16
+; RV32IM-NEXT: andi a5, t0, 32
+; RV32IM-NEXT: andi a6, t0, 64
+; RV32IM-NEXT: andi t1, t0, 128
+; RV32IM-NEXT: andi t2, t0, 256
+; RV32IM-NEXT: andi t3, t0, 512
+; RV32IM-NEXT: andi t0, t0, 1024
+; RV32IM-NEXT: mul a0, a7, a0
+; RV32IM-NEXT: sw a0, 120(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul ra, a7, a1
+; RV32IM-NEXT: mul s11, a7, a2
+; RV32IM-NEXT: mul s8, a7, a3
+; RV32IM-NEXT: mul s7, a7, a4
+; RV32IM-NEXT: mul s4, a7, a5
+; RV32IM-NEXT: mul a0, a7, a6
+; RV32IM-NEXT: sw a0, 124(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a7, t1
+; RV32IM-NEXT: sw a0, 200(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul s1, a7, t2
+; RV32IM-NEXT: mul t3, a7, t3
+; RV32IM-NEXT: mul a0, a7, t0
+; RV32IM-NEXT: sw a0, 116(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a7, s0
+; RV32IM-NEXT: sw a0, 132(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a7, s2
+; RV32IM-NEXT: sw a0, 136(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul t1, a7, s3
+; RV32IM-NEXT: mul a6, a7, s5
+; RV32IM-NEXT: mul s2, a7, s6
+; RV32IM-NEXT: mul s10, a7, s10
+; RV32IM-NEXT: lw a0, 196(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a7, a0
+; RV32IM-NEXT: sw a0, 128(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 192(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a7, a0
+; RV32IM-NEXT: sw a0, 196(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 188(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a3, a7, a0
+; RV32IM-NEXT: lw a0, 184(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a2, a7, a0
+; RV32IM-NEXT: lw a0, 180(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a5, a7, a0
+; RV32IM-NEXT: lw a0, 176(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t2, a7, a0
+; RV32IM-NEXT: lw a0, 172(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s6, a7, a0
+; RV32IM-NEXT: lw a0, 168(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a1, a7, a0
+; RV32IM-NEXT: lw a0, 164(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a7, a0
+; RV32IM-NEXT: lw a4, 160(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a4, a7, a4
+; RV32IM-NEXT: lw t0, 156(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t0, a7, t0
+; RV32IM-NEXT: lw s0, 152(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s0, a7, s0
+; RV32IM-NEXT: lw s3, 148(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s3, a7, s3
+; RV32IM-NEXT: lw s5, 144(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s5, a7, s5
+; RV32IM-NEXT: lw s9, 140(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a7, a7, s9
+; RV32IM-NEXT: lw s9, 120(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor ra, ra, s9
+; RV32IM-NEXT: xor s8, s11, s8
+; RV32IM-NEXT: xor s4, s7, s4
+; RV32IM-NEXT: xor t3, s1, t3
+; RV32IM-NEXT: xor a6, t1, a6
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: xor a1, ra, s8
+; RV32IM-NEXT: lw a3, 124(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, s4, a3
+; RV32IM-NEXT: lw t1, 116(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t1, t3, t1
+; RV32IM-NEXT: xor a6, a6, s2
+; RV32IM-NEXT: xor a2, a2, a5
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: xor a1, a1, a3
+; RV32IM-NEXT: lw a3, 132(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, t1, a3
+; RV32IM-NEXT: xor a4, a6, s10
+; RV32IM-NEXT: xor a2, a2, t2
+; RV32IM-NEXT: xor a0, a0, t0
+; RV32IM-NEXT: lw a5, 200(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a1, a5
+; RV32IM-NEXT: lw a5, 136(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, a5
+; RV32IM-NEXT: lw a5, 128(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a2, a2, s6
+; RV32IM-NEXT: xor a0, a0, s0
+; RV32IM-NEXT: lw a5, 196(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a0, a0, s3
+; RV32IM-NEXT: xor a3, a1, a3
+; RV32IM-NEXT: xor a3, a3, a4
+; RV32IM-NEXT: xor a0, a0, s5
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a0, a7
+; RV32IM-NEXT: lw a4, 216(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: and a3, a2, a4
+; RV32IM-NEXT: xor a0, a2, a0
+; RV32IM-NEXT: srli a2, a2, 8
+; RV32IM-NEXT: and a2, a2, a4
+; RV32IM-NEXT: slli a1, a1, 24
+; RV32IM-NEXT: slli a3, a3, 8
+; RV32IM-NEXT: or a1, a1, a3
+; RV32IM-NEXT: srli a0, a0, 24
+; RV32IM-NEXT: or a0, a2, a0
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 4
+; RV32IM-NEXT: and a0, a0, t5
+; RV32IM-NEXT: and a1, a1, t5
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 2
+; RV32IM-NEXT: and a0, a0, t4
+; RV32IM-NEXT: and a1, a1, t4
+; RV32IM-NEXT: slli a0, a0, 2
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: lui a1, 349525
+; RV32IM-NEXT: addi a1, a1, 1364
+; RV32IM-NEXT: and a2, a0, t6
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: and a0, a0, a1
+; RV32IM-NEXT: slli a2, a2, 1
+; RV32IM-NEXT: or a0, a0, a2
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: lw a1, 204(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a0, a1
+; RV32IM-NEXT: lw a0, 212(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw a2, 208(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a0, a0, a2
+; RV32IM-NEXT: lw ra, 268(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s0, 264(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s1, 260(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s2, 256(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s3, 252(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s4, 248(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s5, 244(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s6, 240(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s7, 236(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s8, 232(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s9, 228(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s10, 224(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s11, 220(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: addi sp, sp, 272
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmul_i64:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -448
+; RV64IM-NEXT: sd ra, 440(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s0, 432(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s1, 424(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s2, 416(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s3, 408(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s4, 400(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s5, 392(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s6, 384(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s7, 376(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s8, 368(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s9, 360(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s10, 352(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s11, 344(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: andi t2, a1, 2
+; RV64IM-NEXT: andi t4, a1, 1
+; RV64IM-NEXT: andi a6, a1, 4
+; RV64IM-NEXT: andi t0, a1, 8
+; RV64IM-NEXT: andi a5, a1, 16
+; RV64IM-NEXT: andi a7, a1, 32
+; RV64IM-NEXT: andi a3, a1, 64
+; RV64IM-NEXT: andi t1, a1, 128
+; RV64IM-NEXT: andi t3, a1, 256
+; RV64IM-NEXT: andi a4, a1, 512
+; RV64IM-NEXT: li a2, 1
+; RV64IM-NEXT: lui s7, 1
+; RV64IM-NEXT: lui t6, 2
+; RV64IM-NEXT: lui s0, 4
+; RV64IM-NEXT: lui s1, 8
+; RV64IM-NEXT: lui s2, 16
+; RV64IM-NEXT: lui s3, 32
+; RV64IM-NEXT: lui s4, 64
+; RV64IM-NEXT: lui s5, 128
+; RV64IM-NEXT: lui s6, 256
+; RV64IM-NEXT: lui s8, 512
+; RV64IM-NEXT: lui s9, 1024
+; RV64IM-NEXT: lui s10, 2048
+; RV64IM-NEXT: lui s11, 4096
+; RV64IM-NEXT: lui ra, 8192
+; RV64IM-NEXT: lui t5, 16384
+; RV64IM-NEXT: mul t2, a0, t2
+; RV64IM-NEXT: mul t4, a0, t4
+; RV64IM-NEXT: xor t2, t4, t2
+; RV64IM-NEXT: lui t4, 32768
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul t0, a0, t0
+; RV64IM-NEXT: xor a6, a6, t0
+; RV64IM-NEXT: lui t0, 65536
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: lui a7, 131072
+; RV64IM-NEXT: mul t1, a0, t1
+; RV64IM-NEXT: mul t3, a0, t3
+; RV64IM-NEXT: xor t1, t1, t3
+; RV64IM-NEXT: lui t3, 262144
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a6, t2, a6
+; RV64IM-NEXT: sd a6, 336(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a6, a2, 11
+; RV64IM-NEXT: sd a6, 216(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and s7, a1, s7
+; RV64IM-NEXT: and a6, a1, t6
+; RV64IM-NEXT: sd a6, 288(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and s0, a1, s0
+; RV64IM-NEXT: and s1, a1, s1
+; RV64IM-NEXT: and s2, a1, s2
+; RV64IM-NEXT: and s3, a1, s3
+; RV64IM-NEXT: and a6, a1, s4
+; RV64IM-NEXT: sd a6, 280(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a6, a1, s5
+; RV64IM-NEXT: and t2, a1, s6
+; RV64IM-NEXT: and s8, a1, s8
+; RV64IM-NEXT: and t6, a1, s9
+; RV64IM-NEXT: sd t6, 272(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t6, a1, s10
+; RV64IM-NEXT: sd t6, 264(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t6, a1, s11
+; RV64IM-NEXT: sd t6, 256(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t6, a1, ra
+; RV64IM-NEXT: and t5, a1, t5
+; RV64IM-NEXT: and t4, a1, t4
+; RV64IM-NEXT: sd t4, 248(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t0, a1, t0
+; RV64IM-NEXT: sd t0, 240(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: sd a7, 232(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a7, a1, t3
+; RV64IM-NEXT: sd a7, 224(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: xor a3, a5, a3
+; RV64IM-NEXT: sd a3, 328(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli t4, a2, 32
+; RV64IM-NEXT: xor a3, t1, a4
+; RV64IM-NEXT: sd a3, 320(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli s4, a2, 33
+; RV64IM-NEXT: mul a3, a0, s0
+; RV64IM-NEXT: mul a4, a0, s1
+; RV64IM-NEXT: xor a3, a3, a4
+; RV64IM-NEXT: sd a3, 312(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli s0, a2, 34
+; RV64IM-NEXT: mul a3, a0, a6
+; RV64IM-NEXT: mul a4, a0, t2
+; RV64IM-NEXT: xor a3, a3, a4
+; RV64IM-NEXT: sd a3, 304(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli s1, a2, 35
+; RV64IM-NEXT: mul a3, a0, t6
+; RV64IM-NEXT: mul a4, a0, t5
+; RV64IM-NEXT: xor a3, a3, a4
+; RV64IM-NEXT: sd a3, 296(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli t5, a2, 36
+; RV64IM-NEXT: slli t6, a2, 37
+; RV64IM-NEXT: slli s5, a2, 38
+; RV64IM-NEXT: slli s6, a2, 39
+; RV64IM-NEXT: slli s9, a2, 40
+; RV64IM-NEXT: slli s10, a2, 41
+; RV64IM-NEXT: slli s11, a2, 42
+; RV64IM-NEXT: slli ra, a2, 43
+; RV64IM-NEXT: slli a3, a2, 44
+; RV64IM-NEXT: sd a3, 208(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 45
+; RV64IM-NEXT: sd a3, 200(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 46
+; RV64IM-NEXT: sd a3, 192(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 47
+; RV64IM-NEXT: sd a3, 184(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 48
+; RV64IM-NEXT: sd a3, 176(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 49
+; RV64IM-NEXT: sd a3, 168(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 50
+; RV64IM-NEXT: sd a3, 160(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 51
+; RV64IM-NEXT: sd a3, 152(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 52
+; RV64IM-NEXT: sd a3, 144(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 53
+; RV64IM-NEXT: sd a3, 136(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a3, a2, 54
+; RV64IM-NEXT: sd a3, 128(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli t1, a2, 55
+; RV64IM-NEXT: slli t0, a2, 56
+; RV64IM-NEXT: slli a7, a2, 57
+; RV64IM-NEXT: slli a6, a2, 58
+; RV64IM-NEXT: slli a5, a2, 59
+; RV64IM-NEXT: slli a4, a2, 60
+; RV64IM-NEXT: slli a3, a2, 61
+; RV64IM-NEXT: slli a2, a2, 62
+; RV64IM-NEXT: ld t2, 216(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t3, a1, t2
+; RV64IM-NEXT: and t2, a1, t4
+; RV64IM-NEXT: sd t2, 120(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t4, a1, s4
+; RV64IM-NEXT: and s0, a1, s0
+; RV64IM-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and s1, a1, s1
+; RV64IM-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t2, a1, t5
+; RV64IM-NEXT: sd t2, 96(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and s1, a1, t6
+; RV64IM-NEXT: and t2, a1, s5
+; RV64IM-NEXT: sd t2, 88(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t2, a1, s6
+; RV64IM-NEXT: sd t2, 80(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and s4, a1, s9
+; RV64IM-NEXT: and s5, a1, s10
+; RV64IM-NEXT: and s6, a1, s11
+; RV64IM-NEXT: and t6, a1, ra
+; RV64IM-NEXT: ld t2, 208(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 72(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 200(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 64(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 192(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and s10, a1, t2
+; RV64IM-NEXT: ld t2, 184(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and s11, a1, t2
+; RV64IM-NEXT: ld t2, 176(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and ra, a1, t2
+; RV64IM-NEXT: ld t2, 168(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 184(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 160(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 160(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 152(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 152(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 144(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 144(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 136(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 136(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld t2, 128(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: sd t2, 128(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and t1, a1, t1
+; RV64IM-NEXT: and t0, a1, t0
+; RV64IM-NEXT: sd t0, 56(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: sd a7, 48(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a5, a1, a5
+; RV64IM-NEXT: sd a5, 40(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a4, a1, a4
+; RV64IM-NEXT: sd a4, 32(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a3, a1, a3
+; RV64IM-NEXT: sd a3, 24(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: and a2, a1, a2
+; RV64IM-NEXT: sd a2, 16(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: andi a2, a1, 1024
+; RV64IM-NEXT: srliw a3, a1, 31
+; RV64IM-NEXT: srli a1, a1, 63
+; RV64IM-NEXT: mul s9, a0, a2
+; RV64IM-NEXT: slli a3, a3, 31
+; RV64IM-NEXT: slli a1, a1, 63
+; RV64IM-NEXT: mul s7, a0, s7
+; RV64IM-NEXT: ld a2, 288(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 192(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul s2, a0, s2
+; RV64IM-NEXT: mul a2, a0, s3
+; RV64IM-NEXT: sd a2, 176(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 280(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 216(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul s0, a0, s8
+; RV64IM-NEXT: ld a2, 272(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 264(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 208(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 256(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 272(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 248(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t2, a0, a2
+; RV64IM-NEXT: ld a2, 240(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 0(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 232(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 200(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a2, 224(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: sd a2, 256(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a2, a0, a3
+; RV64IM-NEXT: sd a2, 280(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 288(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a5, a0, t3
+; RV64IM-NEXT: ld a1, 120(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a7, a0, a1
+; RV64IM-NEXT: mul t4, a0, t4
+; RV64IM-NEXT: ld a1, 112(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t5, a0, a1
+; RV64IM-NEXT: ld a1, 104(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 120(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 96(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 224(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a1, a0, s1
+; RV64IM-NEXT: sd a1, 240(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 88(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 264(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 80(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul s1, a0, a1
+; RV64IM-NEXT: mul s4, a0, s4
+; RV64IM-NEXT: mul s5, a0, s5
+; RV64IM-NEXT: mul s6, a0, s6
+; RV64IM-NEXT: mul a1, a0, t6
+; RV64IM-NEXT: sd a1, 112(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 72(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 168(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 64(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 232(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a1, a0, s10
+; RV64IM-NEXT: sd a1, 248(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul s10, a0, s11
+; RV64IM-NEXT: mul s11, a0, ra
+; RV64IM-NEXT: ld a1, 184(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul ra, a0, a1
+; RV64IM-NEXT: ld a1, 160(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t0, a0, a1
+; RV64IM-NEXT: ld a1, 152(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t3, a0, a1
+; RV64IM-NEXT: ld a1, 144(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul s3, a0, a1
+; RV64IM-NEXT: ld a1, 136(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 152(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 128(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: sd a1, 160(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: mul a1, a0, t1
+; RV64IM-NEXT: sd a1, 184(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: ld a1, 56(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a2, a0, a1
+; RV64IM-NEXT: ld a1, 48(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: mul a3, a0, a6
+; RV64IM-NEXT: ld a4, 40(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: ld a6, 32(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: ld t1, 24(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t1, a0, t1
+; RV64IM-NEXT: ld t6, 16(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul t6, a0, t6
+; RV64IM-NEXT: ld a0, 336(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s8, 328(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a0, a0, s8
+; RV64IM-NEXT: ld s8, 320(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor s9, s8, s9
+; RV64IM-NEXT: xor a5, a5, s7
+; RV64IM-NEXT: ld s7, 312(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor s2, s7, s2
+; RV64IM-NEXT: ld s7, 304(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor s0, s7, s0
+; RV64IM-NEXT: ld s7, 296(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor t2, s7, t2
+; RV64IM-NEXT: xor a7, a7, t4
+; RV64IM-NEXT: xor t4, s1, s4
+; RV64IM-NEXT: xor s1, s10, s11
+; RV64IM-NEXT: xor a1, a2, a1
+; RV64IM-NEXT: xor a0, a0, s9
+; RV64IM-NEXT: ld a2, 192(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a5, a2
+; RV64IM-NEXT: ld a5, 176(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a5, s2, a5
+; RV64IM-NEXT: ld s2, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor s0, s0, s2
+; RV64IM-NEXT: ld s2, 0(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor t2, t2, s2
+; RV64IM-NEXT: xor a7, a7, t5
+; RV64IM-NEXT: xor t4, t4, s5
+; RV64IM-NEXT: xor t5, s1, ra
+; RV64IM-NEXT: xor a1, a1, a3
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 216(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a5, a2
+; RV64IM-NEXT: ld a3, 208(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a3, s0, a3
+; RV64IM-NEXT: ld a5, 200(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a5, t2, a5
+; RV64IM-NEXT: ld t2, 120(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a7, a7, t2
+; RV64IM-NEXT: xor t2, t4, s6
+; RV64IM-NEXT: xor t0, t5, t0
+; RV64IM-NEXT: xor a1, a1, a4
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 272(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: ld a3, 256(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a3, a5, a3
+; RV64IM-NEXT: ld a4, 224(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a4, a7, a4
+; RV64IM-NEXT: ld a5, 112(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a5, t2, a5
+; RV64IM-NEXT: xor a7, t0, t3
+; RV64IM-NEXT: xor a1, a1, a6
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 280(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: ld a3, 240(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a3, a4, a3
+; RV64IM-NEXT: ld a4, 168(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a4, a5, a4
+; RV64IM-NEXT: xor a5, a7, s3
+; RV64IM-NEXT: xor a1, a1, t1
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 264(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: ld a3, 232(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a3, a4, a3
+; RV64IM-NEXT: ld a4, 152(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a4, a5, a4
+; RV64IM-NEXT: xor a1, a1, t6
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 248(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: ld a3, 160(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a3, a4, a3
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 184(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: ld a2, 288(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor a1, a1, a2
+; RV64IM-NEXT: xor a0, a0, a1
+; RV64IM-NEXT: ld ra, 440(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s0, 432(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s1, 424(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s2, 416(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s3, 408(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s4, 400(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s5, 392(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s6, 384(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s7, 376(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s8, 368(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s9, 360(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s10, 352(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s11, 344(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 448
+; RV64IM-NEXT: ret
+ %res = call i64 @llvm.clmul.i64(i64 %a, i64 %b)
+ ret i64 %res
+}
+
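The RV32IM and RV64IM bodies above all follow the same bit-serial expansion: each set bit of one operand is isolated with andi/and, multiplied into the other operand (multiplying by an isolated power of two is an exact shift, so each partial product is carry-free on its own), and the partial products are combined with an xor tree instead of additions. A minimal reference sketch of that computation, assuming nothing beyond the pattern visible in the checks (the name clmul_ref is illustrative, not part of the patch):

    #include <cstdint>

    // Carry-less multiply, mirroring the per-bit mul/xor expansion in the
    // checks above: (b & (1 << i)) isolates one multiplier bit, the mul is
    // then an exact shift of a, and partials are combined with XOR.
    static uint64_t clmul_ref(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (unsigned i = 0; i < 64; ++i)
        r ^= a * (b & (uint64_t{1} << i)); // either 0 or a << i
      return r;
    }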
+define i4 @clmul_constfold_i4() nounwind {
+; CHECK-LABEL: clmul_constfold_i4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: li a0, 2
+; CHECK-NEXT: ret
+ %res = call i4 @llvm.clmul.i4(i4 1, i4 2)
+ ret i4 %res
+}
+
+define i16 @clmul_constfold_i16() nounwind {
+; CHECK-LABEL: clmul_constfold_i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: lui a0, 11
+; CHECK-NEXT: addi a0, a0, -1366
+; CHECK-NEXT: ret
+ %res = call i16 @llvm.clmul.i16(i16 -2, i16 -1)
+ ret i16 %res
+}
+
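Both folded results can be checked by hand: clmul(1, 2) shifts 1 left by one bit, giving 2. For the i16 case, b = -1 has all sixteen bits set, so bit k of the product is the XOR of bits 0..k of a = -2; since only bit 0 of a is clear, that XOR is the parity of k, giving 0b1010101010101010 = 0xAAAA. The checks materialize exactly that value: (11 << 12) - 1366 = 0xB000 - 0x556 = 0xAAAA.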
+define i4 @clmulr_i4(i4 %a, i4 %b) nounwind {
+; CHECK-LABEL: clmulr_i4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: andi a2, a1, 2
+; CHECK-NEXT: andi a3, a1, 1
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: andi a1, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: mul a4, a0, a4
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: xor a0, a4, a0
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: ret
+ %a.ext = zext i4 %a to i8
+ %b.ext = zext i4 %b to i8
+ %clmul = call i8 @llvm.clmul.i8(i8 %a.ext, i8 %b.ext)
+ %res.ext = lshr i8 %clmul, 3
+ %res = trunc i8 %res.ext to i4
+ ret i4 %res
+}
+
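The zext/clmul/lshr/trunc pattern above computes the reversed carry-less product: a 4-bit by 4-bit carry-less product occupies bits 0..6, and the shift by 3 selects bits 6..3, i.e. bits 2n-2..n-1 of the 2n-bit product for n = 4. The clmulr name presumably follows the RISC-V Zbc instruction of the same name, which returns that bit range.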
+define i4 @clmulr_i4_bitreverse(i4 %a, i4 %b) nounwind {
+; CHECK-LABEL: clmulr_i4_bitreverse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: andi a2, a1, 2
+; CHECK-NEXT: andi a3, a1, 1
+; CHECK-NEXT: andi a4, a1, 4
+; CHECK-NEXT: andi a1, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: mul a4, a0, a4
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: xor a0, a4, a0
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: ret
+ %a.rev = call i4 @llvm.bitreverse.i4(i4 %a)
+ %b.rev = call i4 @llvm.bitreverse.i4(i4 %b)
+ %res.rev = call i4 @llvm.clmul.i4(i4 %a.rev, i4 %b.rev)
+ %res = call i4 @llvm.bitreverse.i4(i4 %res.rev)
+ ret i4 %res
+}
+
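Note that clmulr_i4_bitreverse lowers to exactly the same sequence as clmulr_i4, reflecting the identity bitrev(clmul(bitrev(a), bitrev(b))) = clmulr(a, b) on n-bit values. A self-contained exhaustive check of the identity for n = 4, with illustrative helper names:

    #include <cassert>
    #include <cstdint>

    // Low 8 bits of the carry-less product (enough for 4-bit inputs).
    static uint8_t clmul8(uint8_t a, uint8_t b) {
      uint8_t r = 0;
      for (unsigned i = 0; i < 8; ++i)
        if (b & (1u << i))
          r ^= static_cast<uint8_t>(a << i);
      return r;
    }

    // Reverse the low 4 bits.
    static uint8_t bitrev4(uint8_t x) {
      return static_cast<uint8_t>(((x & 1) << 3) | ((x & 2) << 1) |
                                  ((x & 4) >> 1) | ((x & 8) >> 3));
    }

    int main() {
      for (uint8_t a = 0; a < 16; ++a)
        for (uint8_t b = 0; b < 16; ++b) {
          uint8_t viaShift = (clmul8(a, b) >> 3) & 0xF; // as in clmulr_i4
          uint8_t viaRev =
              bitrev4(clmul8(bitrev4(a), bitrev4(b)) & 0xF); // ..._bitreverse
          assert(viaShift == viaRev);
        }
      return 0;
    }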
+define i8 @clmulr_i8(i8 %a, i8 %b) nounwind {
+; RV32IM-LABEL: clmulr_i8:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: zext.b a0, a0
+; RV32IM-NEXT: andi a2, a1, 2
+; RV32IM-NEXT: andi a3, a1, 1
+; RV32IM-NEXT: andi a4, a1, 4
+; RV32IM-NEXT: andi a5, a1, 8
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: andi a3, a1, 16
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: andi a5, a1, 32
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a3, a3, a5
+; RV32IM-NEXT: xor a2, a2, a4
+; RV32IM-NEXT: andi a4, a1, 64
+; RV32IM-NEXT: andi a1, a1, 128
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a3, a3, a4
+; RV32IM-NEXT: xor a2, a2, a3
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: xor a0, a2, a0
+; RV32IM-NEXT: slli a0, a0, 17
+; RV32IM-NEXT: srli a0, a0, 24
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulr_i8:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: zext.b a0, a0
+; RV64IM-NEXT: andi a2, a1, 2
+; RV64IM-NEXT: andi a3, a1, 1
+; RV64IM-NEXT: andi a4, a1, 4
+; RV64IM-NEXT: andi a5, a1, 8
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: andi a3, a1, 16
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: andi a5, a1, 32
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a3, a3, a5
+; RV64IM-NEXT: xor a2, a2, a4
+; RV64IM-NEXT: andi a4, a1, 64
+; RV64IM-NEXT: andi a1, a1, 128
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a3, a3, a4
+; RV64IM-NEXT: xor a2, a2, a3
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: xor a0, a2, a0
+; RV64IM-NEXT: slli a0, a0, 49
+; RV64IM-NEXT: srli a0, a0, 56
+; RV64IM-NEXT: ret
+ %a.ext = zext i8 %a to i16
+ %b.ext = zext i8 %b to i16
+ %clmul = call i16 @llvm.clmul.i16(i16 %a.ext, i16 %b.ext)
+ %res.ext = lshr i16 %clmul, 7
+ %res = trunc i16 %res.ext to i8
+ ret i8 %res
+}
+
+define i16 @clmulr_i16(i16 %a, i16 %b) nounwind {
+; RV32IM-LABEL: clmulr_i16:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: andi a2, a1, 2
+; RV32IM-NEXT: andi a3, a1, 1
+; RV32IM-NEXT: andi a4, a1, 4
+; RV32IM-NEXT: andi a5, a1, 8
+; RV32IM-NEXT: andi a6, a1, 16
+; RV32IM-NEXT: andi a7, a1, 32
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: andi a3, a1, 64
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: andi a5, a1, 128
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a6, a6, a7
+; RV32IM-NEXT: andi a7, a1, 256
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: andi a7, a1, 512
+; RV32IM-NEXT: xor a2, a2, a4
+; RV32IM-NEXT: li a4, 1
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a3, a6, a3
+; RV32IM-NEXT: lui a6, 1
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: lui a7, 2
+; RV32IM-NEXT: slli a4, a4, 11
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a4, a1, a4
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a4, a4, a6
+; RV32IM-NEXT: lui a6, 4
+; RV32IM-NEXT: xor a2, a2, a3
+; RV32IM-NEXT: lui a3, 8
+; RV32IM-NEXT: and a7, a1, a7
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a3, a1, a3
+; RV32IM-NEXT: andi a1, a1, 1024
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: xor a1, a5, a1
+; RV32IM-NEXT: mul a5, a0, a7
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a1, a2, a1
+; RV32IM-NEXT: mul a2, a0, a6
+; RV32IM-NEXT: xor a2, a4, a2
+; RV32IM-NEXT: xor a1, a1, a2
+; RV32IM-NEXT: mul a0, a0, a3
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: slli a0, a0, 1
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulr_i16:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: andi a2, a1, 2
+; RV64IM-NEXT: andi a3, a1, 1
+; RV64IM-NEXT: andi a4, a1, 4
+; RV64IM-NEXT: andi a5, a1, 8
+; RV64IM-NEXT: andi a6, a1, 16
+; RV64IM-NEXT: andi a7, a1, 32
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: andi a3, a1, 64
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: andi a5, a1, 128
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a6, a6, a7
+; RV64IM-NEXT: andi a7, a1, 256
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: andi a7, a1, 512
+; RV64IM-NEXT: xor a2, a2, a4
+; RV64IM-NEXT: li a4, 1
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: lui a6, 1
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: lui a7, 2
+; RV64IM-NEXT: slli a4, a4, 11
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a4, a1, a4
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a4, a4, a6
+; RV64IM-NEXT: lui a6, 4
+; RV64IM-NEXT: xor a2, a2, a3
+; RV64IM-NEXT: lui a3, 8
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a3, a1, a3
+; RV64IM-NEXT: andi a1, a1, 1024
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: xor a1, a5, a1
+; RV64IM-NEXT: mul a5, a0, a7
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: xor a1, a2, a1
+; RV64IM-NEXT: mul a2, a0, a6
+; RV64IM-NEXT: xor a2, a4, a2
+; RV64IM-NEXT: xor a1, a1, a2
+; RV64IM-NEXT: mul a0, a0, a3
+; RV64IM-NEXT: xor a0, a1, a0
+; RV64IM-NEXT: slli a0, a0, 33
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: ret
+ %a.ext = zext i16 %a to i32
+ %b.ext = zext i16 %b to i32
+ %clmul = call i32 @llvm.clmul.i32(i32 %a.ext, i32 %b.ext)
+ %res.ext = lshr i32 %clmul, 15
+ %res = trunc i32 %res.ext to i16
+ ret i16 %res
+}
+
+define i32 @clmulr_i32(i32 %a, i32 %b) nounwind {
+; RV32IM-LABEL: clmulr_i32:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: addi sp, sp, -144
+; RV32IM-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: srli t0, a0, 8
+; RV32IM-NEXT: lui a3, 16
+; RV32IM-NEXT: srli t1, a0, 24
+; RV32IM-NEXT: slli a2, a0, 24
+; RV32IM-NEXT: lui t3, 61681
+; RV32IM-NEXT: lui t5, 209715
+; RV32IM-NEXT: lui t6, 349525
+; RV32IM-NEXT: srli t4, a1, 8
+; RV32IM-NEXT: srli a4, a1, 24
+; RV32IM-NEXT: slli a5, a1, 24
+; RV32IM-NEXT: li s7, 1
+; RV32IM-NEXT: lui t2, 4
+; RV32IM-NEXT: lui s0, 8
+; RV32IM-NEXT: lui s1, 32
+; RV32IM-NEXT: lui s2, 64
+; RV32IM-NEXT: lui s3, 128
+; RV32IM-NEXT: lui s4, 256
+; RV32IM-NEXT: lui s8, 512
+; RV32IM-NEXT: lui a7, 1024
+; RV32IM-NEXT: lui s9, 2048
+; RV32IM-NEXT: lui s10, 4096
+; RV32IM-NEXT: lui s11, 8192
+; RV32IM-NEXT: lui ra, 16384
+; RV32IM-NEXT: addi s5, a3, -256
+; RV32IM-NEXT: and t0, t0, s5
+; RV32IM-NEXT: or t1, t0, t1
+; RV32IM-NEXT: lui a6, 32768
+; RV32IM-NEXT: and t4, t4, s5
+; RV32IM-NEXT: or a4, t4, a4
+; RV32IM-NEXT: lui t0, 65536
+; RV32IM-NEXT: and a0, a0, s5
+; RV32IM-NEXT: slli a0, a0, 8
+; RV32IM-NEXT: or a0, a2, a0
+; RV32IM-NEXT: lui a2, 131072
+; RV32IM-NEXT: and a1, a1, s5
+; RV32IM-NEXT: slli a1, a1, 8
+; RV32IM-NEXT: or t4, a5, a1
+; RV32IM-NEXT: lui a1, 262144
+; RV32IM-NEXT: or a0, a0, t1
+; RV32IM-NEXT: lui a5, 524288
+; RV32IM-NEXT: addi t3, t3, -241
+; RV32IM-NEXT: addi t5, t5, 819
+; RV32IM-NEXT: addi t6, t6, 1365
+; RV32IM-NEXT: slli s7, s7, 11
+; RV32IM-NEXT: or a4, t4, a4
+; RV32IM-NEXT: srli t4, a0, 4
+; RV32IM-NEXT: and a0, a0, t3
+; RV32IM-NEXT: and t4, t4, t3
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: or a0, t4, a0
+; RV32IM-NEXT: srli t4, a4, 4
+; RV32IM-NEXT: and a4, a4, t3
+; RV32IM-NEXT: and t4, t4, t3
+; RV32IM-NEXT: slli a4, a4, 4
+; RV32IM-NEXT: or a4, t4, a4
+; RV32IM-NEXT: srli t4, a0, 2
+; RV32IM-NEXT: and a0, a0, t5
+; RV32IM-NEXT: and t4, t4, t5
+; RV32IM-NEXT: slli a0, a0, 2
+; RV32IM-NEXT: or a0, t4, a0
+; RV32IM-NEXT: srli t4, a4, 2
+; RV32IM-NEXT: and a4, a4, t5
+; RV32IM-NEXT: and t4, t4, t5
+; RV32IM-NEXT: slli a4, a4, 2
+; RV32IM-NEXT: or t4, t4, a4
+; RV32IM-NEXT: srli a4, a0, 1
+; RV32IM-NEXT: and a0, a0, t6
+; RV32IM-NEXT: and a4, a4, t6
+; RV32IM-NEXT: slli a0, a0, 1
+; RV32IM-NEXT: or a4, a4, a0
+; RV32IM-NEXT: srli a0, t4, 1
+; RV32IM-NEXT: and t4, t4, t6
+; RV32IM-NEXT: and a0, a0, t6
+; RV32IM-NEXT: slli t4, t4, 1
+; RV32IM-NEXT: or a0, a0, t4
+; RV32IM-NEXT: andi t4, a0, 2
+; RV32IM-NEXT: and s6, a0, s7
+; RV32IM-NEXT: lui t1, 1
+; RV32IM-NEXT: and t1, a0, t1
+; RV32IM-NEXT: sw t1, 84(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lui t1, 2
+; RV32IM-NEXT: and t1, a0, t1
+; RV32IM-NEXT: sw t1, 80(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and t1, a0, t2
+; RV32IM-NEXT: sw t1, 76(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s0, a0, s0
+; RV32IM-NEXT: and a3, a0, a3
+; RV32IM-NEXT: sw a3, 72(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s1, a0, s1
+; RV32IM-NEXT: sw s1, 68(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, s2
+; RV32IM-NEXT: sw a3, 64(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s3, a0, s3
+; RV32IM-NEXT: and a3, a0, s4
+; RV32IM-NEXT: sw a3, 60(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, s8
+; RV32IM-NEXT: sw a3, 56(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, a7
+; RV32IM-NEXT: sw a3, 52(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s9, a0, s9
+; RV32IM-NEXT: and a3, a0, s10
+; RV32IM-NEXT: sw a3, 48(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, s11
+; RV32IM-NEXT: sw a3, 44(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, ra
+; RV32IM-NEXT: sw a3, 40(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, a6
+; RV32IM-NEXT: sw a3, 36(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, t0
+; RV32IM-NEXT: sw a3, 32(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a2, a0, a2
+; RV32IM-NEXT: sw a2, 28(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a1, a0, a1
+; RV32IM-NEXT: sw a1, 24(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a5, a0, a5
+; RV32IM-NEXT: sw a5, 20(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi a1, a0, 1
+; RV32IM-NEXT: andi a2, a0, 4
+; RV32IM-NEXT: andi a3, a0, 8
+; RV32IM-NEXT: andi a5, a0, 16
+; RV32IM-NEXT: andi a6, a0, 32
+; RV32IM-NEXT: andi a7, a0, 64
+; RV32IM-NEXT: andi t0, a0, 128
+; RV32IM-NEXT: andi t1, a0, 256
+; RV32IM-NEXT: andi t2, a0, 512
+; RV32IM-NEXT: andi a0, a0, 1024
+; RV32IM-NEXT: mul t4, a4, t4
+; RV32IM-NEXT: sw t4, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul ra, a4, a1
+; RV32IM-NEXT: mul s11, a4, a2
+; RV32IM-NEXT: mul s8, a4, a3
+; RV32IM-NEXT: mul s7, a4, a5
+; RV32IM-NEXT: mul s4, a4, a6
+; RV32IM-NEXT: mul a1, a4, a7
+; RV32IM-NEXT: sw a1, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a4, t0
+; RV32IM-NEXT: sw a1, 88(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul s2, a4, t1
+; RV32IM-NEXT: mul t2, a4, t2
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: sw a0, 4(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a4, s6
+; RV32IM-NEXT: sw a0, 16(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 84(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: sw a0, 84(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 80(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t1, a4, a0
+; RV32IM-NEXT: lw a0, 76(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a7, a4, a0
+; RV32IM-NEXT: mul s1, a4, s0
+; RV32IM-NEXT: lw a0, 72(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: sw a0, 72(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 68(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: sw a0, 76(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 64(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: sw a0, 80(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a3, a4, s3
+; RV32IM-NEXT: lw a0, 60(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a2, a4, a0
+; RV32IM-NEXT: lw a0, 56(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a6, a4, a0
+; RV32IM-NEXT: lw a0, 52(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t4, a4, a0
+; RV32IM-NEXT: mul s6, a4, s9
+; RV32IM-NEXT: lw a0, 48(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a1, a4, a0
+; RV32IM-NEXT: lw a0, 44(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a4, a0
+; RV32IM-NEXT: lw a5, 40(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a5, a4, a5
+; RV32IM-NEXT: lw t0, 36(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t0, a4, t0
+; RV32IM-NEXT: lw s0, 32(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s0, a4, s0
+; RV32IM-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s3, a4, s3
+; RV32IM-NEXT: lw s9, 24(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s9, a4, s9
+; RV32IM-NEXT: lw s10, 20(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a4, a4, s10
+; RV32IM-NEXT: lw s10, 8(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor ra, ra, s10
+; RV32IM-NEXT: xor s8, s11, s8
+; RV32IM-NEXT: xor s4, s7, s4
+; RV32IM-NEXT: xor t2, s2, t2
+; RV32IM-NEXT: xor a7, t1, a7
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: xor a1, ra, s8
+; RV32IM-NEXT: lw a3, 12(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, s4, a3
+; RV32IM-NEXT: lw t1, 4(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor t1, t2, t1
+; RV32IM-NEXT: xor a7, a7, s1
+; RV32IM-NEXT: xor a2, a2, a6
+; RV32IM-NEXT: xor a0, a0, a5
+; RV32IM-NEXT: xor a1, a1, a3
+; RV32IM-NEXT: lw a3, 16(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, t1, a3
+; RV32IM-NEXT: lw a5, 72(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a5, a7, a5
+; RV32IM-NEXT: xor a2, a2, t4
+; RV32IM-NEXT: xor a0, a0, t0
+; RV32IM-NEXT: lw a6, 88(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a1, a6
+; RV32IM-NEXT: lw a6, 84(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, a6
+; RV32IM-NEXT: lw a6, 76(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a5, a5, a6
+; RV32IM-NEXT: xor a2, a2, s6
+; RV32IM-NEXT: xor a0, a0, s0
+; RV32IM-NEXT: lw a6, 80(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a5, a5, a6
+; RV32IM-NEXT: xor a0, a0, s3
+; RV32IM-NEXT: xor a3, a1, a3
+; RV32IM-NEXT: slli a1, a1, 24
+; RV32IM-NEXT: xor a3, a3, a5
+; RV32IM-NEXT: xor a0, a0, s9
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: and a3, a2, s5
+; RV32IM-NEXT: srli a4, a2, 8
+; RV32IM-NEXT: xor a0, a2, a0
+; RV32IM-NEXT: slli a3, a3, 8
+; RV32IM-NEXT: and a2, a4, s5
+; RV32IM-NEXT: srli a0, a0, 24
+; RV32IM-NEXT: or a1, a1, a3
+; RV32IM-NEXT: or a0, a2, a0
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 4
+; RV32IM-NEXT: and a0, a0, t3
+; RV32IM-NEXT: and a1, a1, t3
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 2
+; RV32IM-NEXT: and a0, a0, t5
+; RV32IM-NEXT: and a1, a1, t5
+; RV32IM-NEXT: slli a0, a0, 2
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 1
+; RV32IM-NEXT: and a0, a0, t6
+; RV32IM-NEXT: and a1, a1, t6
+; RV32IM-NEXT: slli a0, a0, 1
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: addi sp, sp, 144
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulr_i32:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -128
+; RV64IM-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s4, 80(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s5, 72(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s6, 64(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s7, 56(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s8, 48(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s9, 40(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s10, 32(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s11, 24(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a6, a0, 32
+; RV64IM-NEXT: andi t1, a1, 2
+; RV64IM-NEXT: andi t3, a1, 1
+; RV64IM-NEXT: andi a5, a1, 4
+; RV64IM-NEXT: andi a7, a1, 8
+; RV64IM-NEXT: andi a3, a1, 16
+; RV64IM-NEXT: andi a4, a1, 32
+; RV64IM-NEXT: andi a0, a1, 64
+; RV64IM-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: andi t0, a1, 128
+; RV64IM-NEXT: andi t2, a1, 256
+; RV64IM-NEXT: andi a0, a1, 512
+; RV64IM-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: li a2, 1
+; RV64IM-NEXT: lui t5, 1
+; RV64IM-NEXT: lui t6, 2
+; RV64IM-NEXT: lui s0, 4
+; RV64IM-NEXT: lui s2, 8
+; RV64IM-NEXT: lui s3, 16
+; RV64IM-NEXT: lui s4, 32
+; RV64IM-NEXT: lui s5, 64
+; RV64IM-NEXT: lui s6, 128
+; RV64IM-NEXT: lui s7, 256
+; RV64IM-NEXT: lui s8, 512
+; RV64IM-NEXT: lui s9, 1024
+; RV64IM-NEXT: lui s10, 2048
+; RV64IM-NEXT: lui s11, 4096
+; RV64IM-NEXT: lui ra, 8192
+; RV64IM-NEXT: lui a0, 16384
+; RV64IM-NEXT: srli s1, a6, 32
+; RV64IM-NEXT: mul a6, s1, t1
+; RV64IM-NEXT: mul t1, s1, t3
+; RV64IM-NEXT: xor a6, t1, a6
+; RV64IM-NEXT: sd a6, 0(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: lui t1, 32768
+; RV64IM-NEXT: mul a5, s1, a5
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: xor t4, a5, a7
+; RV64IM-NEXT: lui a7, 65536
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: xor a6, a3, a4
+; RV64IM-NEXT: lui t3, 131072
+; RV64IM-NEXT: mul a4, s1, t0
+; RV64IM-NEXT: mul t0, s1, t2
+; RV64IM-NEXT: xor a5, a4, t0
+; RV64IM-NEXT: lui t0, 262144
+; RV64IM-NEXT: slli t2, a2, 11
+; RV64IM-NEXT: and t5, a1, t5
+; RV64IM-NEXT: and t6, a1, t6
+; RV64IM-NEXT: and s0, a1, s0
+; RV64IM-NEXT: and s2, a1, s2
+; RV64IM-NEXT: and s3, a1, s3
+; RV64IM-NEXT: and s4, a1, s4
+; RV64IM-NEXT: and s5, a1, s5
+; RV64IM-NEXT: and s6, a1, s6
+; RV64IM-NEXT: and s7, a1, s7
+; RV64IM-NEXT: and s8, a1, s8
+; RV64IM-NEXT: and s9, a1, s9
+; RV64IM-NEXT: and s10, a1, s10
+; RV64IM-NEXT: and s11, a1, s11
+; RV64IM-NEXT: and ra, a1, ra
+; RV64IM-NEXT: and a2, a1, a0
+; RV64IM-NEXT: and t1, a1, t1
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and t3, a1, t3
+; RV64IM-NEXT: and t0, a1, t0
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: andi a0, a1, 1024
+; RV64IM-NEXT: srliw a1, a1, 31
+; RV64IM-NEXT: slli a1, a1, 31
+; RV64IM-NEXT: ld a3, 16(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: ld a4, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: mul a0, s1, a0
+; RV64IM-NEXT: mul t5, s1, t5
+; RV64IM-NEXT: mul t6, s1, t6
+; RV64IM-NEXT: mul s0, s1, s0
+; RV64IM-NEXT: mul s2, s1, s2
+; RV64IM-NEXT: mul s3, s1, s3
+; RV64IM-NEXT: mul s4, s1, s4
+; RV64IM-NEXT: mul s5, s1, s5
+; RV64IM-NEXT: mul s6, s1, s6
+; RV64IM-NEXT: mul s7, s1, s7
+; RV64IM-NEXT: mul s8, s1, s8
+; RV64IM-NEXT: mul s9, s1, s9
+; RV64IM-NEXT: mul s10, s1, s10
+; RV64IM-NEXT: mul s11, s1, s11
+; RV64IM-NEXT: mul ra, s1, ra
+; RV64IM-NEXT: mul a2, s1, a2
+; RV64IM-NEXT: mul t1, s1, t1
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: mul t3, s1, t3
+; RV64IM-NEXT: mul t0, s1, t0
+; RV64IM-NEXT: mul a1, s1, a1
+; RV64IM-NEXT: mul t2, s1, t2
+; RV64IM-NEXT: xor s1, s2, s3
+; RV64IM-NEXT: xor s2, s8, s9
+; RV64IM-NEXT: xor a7, a7, t3
+; RV64IM-NEXT: ld t3, 0(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor t3, t3, t4
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: xor a4, a5, a4
+; RV64IM-NEXT: xor a5, t2, t5
+; RV64IM-NEXT: xor a6, s1, s4
+; RV64IM-NEXT: xor t2, s2, s10
+; RV64IM-NEXT: xor a7, a7, t0
+; RV64IM-NEXT: xor a3, t3, a3
+; RV64IM-NEXT: xor a0, a4, a0
+; RV64IM-NEXT: xor a4, a5, t6
+; RV64IM-NEXT: xor a5, a6, s5
+; RV64IM-NEXT: xor a6, t2, s11
+; RV64IM-NEXT: xor a0, a3, a0
+; RV64IM-NEXT: xor a4, a4, s0
+; RV64IM-NEXT: xor a3, a5, s6
+; RV64IM-NEXT: xor a5, a6, ra
+; RV64IM-NEXT: xor a0, a0, a4
+; RV64IM-NEXT: xor a3, a3, s7
+; RV64IM-NEXT: xor a2, a5, a2
+; RV64IM-NEXT: xor a0, a0, a3
+; RV64IM-NEXT: xor a2, a2, t1
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: xor a1, a7, a1
+; RV64IM-NEXT: xor a0, a0, a1
+; RV64IM-NEXT: slli a0, a0, 1
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s3, 88(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s4, 80(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s5, 72(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s6, 64(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s7, 56(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s8, 48(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 128
+; RV64IM-NEXT: ret
+ %a.ext = zext i32 %a to i64
+ %b.ext = zext i32 %b to i64
+ %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext)
+ %res.ext = lshr i64 %clmul, 31
+ %res = trunc i64 %res.ext to i32
+ ret i32 %res
+}
+
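The i32 reversed variant bit-reverses the value in registers before and after the per-bit product, which is where the large constants in the checks come from: lui 61681 + addi -241 materializes 0x0F0F0F0F, lui 209715 + addi 819 materializes 0x33333333, and lui 349525 + addi 1365 materializes 0x55555555 (with lui 16 + addi -256 giving the 0xFF00 byte-swap mask). These are the masks of the classic byte-swap-plus-swizzle bit reversal; a sketch of the same sequence, assuming only the constants visible above:

    #include <cstdint>

    // 32-bit bit reversal with the masks materialized in the checks:
    // byte swap, then swap nibbles, bit pairs, and single bits.
    static uint32_t bitrev32(uint32_t x) {
      x = (x >> 24) | ((x >> 8) & 0x0000FF00u) |
          ((x << 8) & 0x00FF0000u) | (x << 24);                // bswap
      x = ((x >> 4) & 0x0F0F0F0Fu) | ((x & 0x0F0F0F0Fu) << 4); // lui 61681
      x = ((x >> 2) & 0x33333333u) | ((x & 0x33333333u) << 2); // lui 209715
      x = ((x >> 1) & 0x55555555u) | ((x & 0x55555555u) << 1); // lui 349525
      return x;
    }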
+define i4 @clmulh_i4(i4 %a, i4 %b) nounwind {
+; CHECK-LABEL: clmulh_i4:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: andi a2, a1, 4
+; CHECK-NEXT: andi a3, a1, 2
+; CHECK-NEXT: andi a1, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: srli a0, a0, 4
+; CHECK-NEXT: ret
+ %a.ext = zext i4 %a to i8
+ %b.ext = zext i4 %b to i8
+ %clmul = call i8 @llvm.clmul.i8(i8 %a.ext, i8 %b.ext)
+ %res.ext = lshr i8 %clmul, 4
+ %res = trunc i8 %res.ext to i4
+ ret i4 %res
+}
+
+define i4 @clmulh_i4_bitreverse(i4 %a, i4 %b) nounwind {
+; CHECK-LABEL: clmulh_i4_bitreverse:
+; CHECK: # %bb.0:
+; CHECK-NEXT: andi a0, a0, 15
+; CHECK-NEXT: andi a2, a1, 4
+; CHECK-NEXT: andi a3, a1, 2
+; CHECK-NEXT: andi a1, a1, 8
+; CHECK-NEXT: mul a2, a0, a2
+; CHECK-NEXT: mul a3, a0, a3
+; CHECK-NEXT: xor a2, a3, a2
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: xor a0, a2, a0
+; CHECK-NEXT: srli a0, a0, 4
+; CHECK-NEXT: ret
+ %a.rev = call i4 @llvm.bitreverse.i4(i4 %a)
+ %b.rev = call i4 @llvm.bitreverse.i4(i4 %b)
+ %clmul = call i4 @llvm.clmul.i4(i4 %a.rev, i4 %b.rev)
+ %clmul.rev = call i4 @llvm.bitreverse.i4(i4 %clmul)
+ %res = lshr i4 %clmul.rev, 1
+ ret i4 %res
+}
+
+
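Likewise, clmulh_i4 and clmulh_i4_bitreverse compile to a single common sequence: shifting the reversed product right by one recovers the high half, clmulh(a, b) = lshr(clmulr(a, b), 1), since clmulr holds product bits 2n-2..n-1 and bit 2n-1 of an n-bit by n-bit carry-less product is always zero.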
+define i8 @clmulh_i8(i8 %a, i8 %b) nounwind {
+; RV32IM-LABEL: clmulh_i8:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: zext.b a0, a0
+; RV32IM-NEXT: andi a2, a1, 2
+; RV32IM-NEXT: andi a3, a1, 1
+; RV32IM-NEXT: andi a4, a1, 4
+; RV32IM-NEXT: andi a5, a1, 8
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: andi a3, a1, 16
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: andi a5, a1, 32
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a3, a3, a5
+; RV32IM-NEXT: xor a2, a2, a4
+; RV32IM-NEXT: andi a4, a1, 64
+; RV32IM-NEXT: andi a1, a1, 128
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a3, a3, a4
+; RV32IM-NEXT: xor a2, a2, a3
+; RV32IM-NEXT: mul a0, a0, a1
+; RV32IM-NEXT: xor a0, a2, a0
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: srli a0, a0, 24
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulh_i8:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: zext.b a0, a0
+; RV64IM-NEXT: andi a2, a1, 2
+; RV64IM-NEXT: andi a3, a1, 1
+; RV64IM-NEXT: andi a4, a1, 4
+; RV64IM-NEXT: andi a5, a1, 8
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: andi a3, a1, 16
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: andi a5, a1, 32
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a3, a3, a5
+; RV64IM-NEXT: xor a2, a2, a4
+; RV64IM-NEXT: andi a4, a1, 64
+; RV64IM-NEXT: andi a1, a1, 128
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a3, a3, a4
+; RV64IM-NEXT: xor a2, a2, a3
+; RV64IM-NEXT: mul a0, a0, a1
+; RV64IM-NEXT: xor a0, a2, a0
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: srli a0, a0, 56
+; RV64IM-NEXT: ret
+ %a.ext = zext i8 %a to i16
+ %b.ext = zext i8 %b to i16
+ %clmul = call i16 @llvm.clmul.i16(i16 %a.ext, i16 %b.ext)
+ %res.ext = lshr i16 %clmul, 8
+ %res = trunc i16 %res.ext to i8
+ ret i8 %res
+}
+
+define i16 @clmulh_i16(i16 %a, i16 %b) nounwind {
+; RV32IM-LABEL: clmulh_i16:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: slli a0, a0, 16
+; RV32IM-NEXT: andi a2, a1, 2
+; RV32IM-NEXT: andi a3, a1, 1
+; RV32IM-NEXT: andi a4, a1, 4
+; RV32IM-NEXT: andi a5, a1, 8
+; RV32IM-NEXT: andi a6, a1, 16
+; RV32IM-NEXT: andi a7, a1, 32
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: mul a2, a0, a2
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: andi a3, a1, 64
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: andi a5, a1, 128
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a6, a6, a7
+; RV32IM-NEXT: andi a7, a1, 256
+; RV32IM-NEXT: mul a5, a0, a5
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: andi a7, a1, 512
+; RV32IM-NEXT: xor a2, a2, a4
+; RV32IM-NEXT: li a4, 1
+; RV32IM-NEXT: mul a3, a0, a3
+; RV32IM-NEXT: xor a3, a6, a3
+; RV32IM-NEXT: lui a6, 1
+; RV32IM-NEXT: mul a7, a0, a7
+; RV32IM-NEXT: xor a5, a5, a7
+; RV32IM-NEXT: lui a7, 2
+; RV32IM-NEXT: slli a4, a4, 11
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a4, a1, a4
+; RV32IM-NEXT: mul a6, a0, a6
+; RV32IM-NEXT: mul a4, a0, a4
+; RV32IM-NEXT: xor a4, a4, a6
+; RV32IM-NEXT: lui a6, 4
+; RV32IM-NEXT: xor a2, a2, a3
+; RV32IM-NEXT: lui a3, 8
+; RV32IM-NEXT: and a7, a1, a7
+; RV32IM-NEXT: and a6, a1, a6
+; RV32IM-NEXT: and a3, a1, a3
+; RV32IM-NEXT: andi a1, a1, 1024
+; RV32IM-NEXT: mul a1, a0, a1
+; RV32IM-NEXT: xor a1, a5, a1
+; RV32IM-NEXT: mul a5, a0, a7
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a1, a2, a1
+; RV32IM-NEXT: mul a2, a0, a6
+; RV32IM-NEXT: xor a2, a4, a2
+; RV32IM-NEXT: xor a1, a1, a2
+; RV32IM-NEXT: mul a0, a0, a3
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: srli a0, a0, 16
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulh_i16:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: slli a0, a0, 48
+; RV64IM-NEXT: andi a2, a1, 2
+; RV64IM-NEXT: andi a3, a1, 1
+; RV64IM-NEXT: andi a4, a1, 4
+; RV64IM-NEXT: andi a5, a1, 8
+; RV64IM-NEXT: andi a6, a1, 16
+; RV64IM-NEXT: andi a7, a1, 32
+; RV64IM-NEXT: srli a0, a0, 48
+; RV64IM-NEXT: mul a2, a0, a2
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a2, a3, a2
+; RV64IM-NEXT: andi a3, a1, 64
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: andi a5, a1, 128
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a6, a6, a7
+; RV64IM-NEXT: andi a7, a1, 256
+; RV64IM-NEXT: mul a5, a0, a5
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: andi a7, a1, 512
+; RV64IM-NEXT: xor a2, a2, a4
+; RV64IM-NEXT: li a4, 1
+; RV64IM-NEXT: mul a3, a0, a3
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: lui a6, 1
+; RV64IM-NEXT: mul a7, a0, a7
+; RV64IM-NEXT: xor a5, a5, a7
+; RV64IM-NEXT: lui a7, 2
+; RV64IM-NEXT: slli a4, a4, 11
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a4, a1, a4
+; RV64IM-NEXT: mul a6, a0, a6
+; RV64IM-NEXT: mul a4, a0, a4
+; RV64IM-NEXT: xor a4, a4, a6
+; RV64IM-NEXT: lui a6, 4
+; RV64IM-NEXT: xor a2, a2, a3
+; RV64IM-NEXT: lui a3, 8
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and a6, a1, a6
+; RV64IM-NEXT: and a3, a1, a3
+; RV64IM-NEXT: andi a1, a1, 1024
+; RV64IM-NEXT: mul a1, a0, a1
+; RV64IM-NEXT: xor a1, a5, a1
+; RV64IM-NEXT: mul a5, a0, a7
+; RV64IM-NEXT: xor a4, a4, a5
+; RV64IM-NEXT: xor a1, a2, a1
+; RV64IM-NEXT: mul a2, a0, a6
+; RV64IM-NEXT: xor a2, a4, a2
+; RV64IM-NEXT: xor a1, a1, a2
+; RV64IM-NEXT: mul a0, a0, a3
+; RV64IM-NEXT: xor a0, a1, a0
+; RV64IM-NEXT: srliw a0, a0, 16
+; RV64IM-NEXT: ret
+ %a.ext = zext i16 %a to i32
+ %b.ext = zext i16 %b to i32
+ %clmul = call i32 @llvm.clmul.i32(i32 %a.ext, i32 %b.ext)
+ %res.ext = lshr i32 %clmul, 16
+ %res = trunc i32 %res.ext to i16
+ ret i16 %res
+}
+
+define i32 @clmulh_i32(i32 %a, i32 %b) nounwind {
+; RV32IM-LABEL: clmulh_i32:
+; RV32IM: # %bb.0:
+; RV32IM-NEXT: addi sp, sp, -144
+; RV32IM-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s2, 128(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s3, 124(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s4, 120(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s5, 116(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s6, 112(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s7, 108(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s8, 104(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s9, 100(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s10, 96(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: sw s11, 92(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: srli t0, a0, 8
+; RV32IM-NEXT: lui a3, 16
+; RV32IM-NEXT: srli t1, a0, 24
+; RV32IM-NEXT: slli a2, a0, 24
+; RV32IM-NEXT: lui s1, 61681
+; RV32IM-NEXT: lui s3, 209715
+; RV32IM-NEXT: lui a6, 349525
+; RV32IM-NEXT: srli t4, a1, 8
+; RV32IM-NEXT: srli t6, a1, 24
+; RV32IM-NEXT: slli a4, a1, 24
+; RV32IM-NEXT: li t3, 1
+; RV32IM-NEXT: lui s11, 2
+; RV32IM-NEXT: lui t2, 4
+; RV32IM-NEXT: lui s10, 8
+; RV32IM-NEXT: lui t5, 32
+; RV32IM-NEXT: lui s0, 64
+; RV32IM-NEXT: lui s2, 128
+; RV32IM-NEXT: lui s4, 256
+; RV32IM-NEXT: lui s5, 512
+; RV32IM-NEXT: lui s6, 1024
+; RV32IM-NEXT: lui s7, 2048
+; RV32IM-NEXT: lui s8, 4096
+; RV32IM-NEXT: lui s9, 8192
+; RV32IM-NEXT: lui ra, 16384
+; RV32IM-NEXT: addi a3, a3, -256
+; RV32IM-NEXT: lui a5, 16
+; RV32IM-NEXT: and t0, t0, a3
+; RV32IM-NEXT: or t1, t0, t1
+; RV32IM-NEXT: lui a7, 32768
+; RV32IM-NEXT: and t4, t4, a3
+; RV32IM-NEXT: or t6, t4, t6
+; RV32IM-NEXT: lui t0, 65536
+; RV32IM-NEXT: and a0, a0, a3
+; RV32IM-NEXT: mv t4, a3
+; RV32IM-NEXT: sw a3, 88(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: slli a0, a0, 8
+; RV32IM-NEXT: or a2, a2, a0
+; RV32IM-NEXT: lui a3, 131072
+; RV32IM-NEXT: and a1, a1, t4
+; RV32IM-NEXT: slli a1, a1, 8
+; RV32IM-NEXT: or a0, a4, a1
+; RV32IM-NEXT: lui a1, 262144
+; RV32IM-NEXT: addi s1, s1, -241
+; RV32IM-NEXT: addi s3, s3, 819
+; RV32IM-NEXT: or a2, a2, t1
+; RV32IM-NEXT: addi a4, a6, 1365
+; RV32IM-NEXT: sw a4, 84(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: or a0, a0, t6
+; RV32IM-NEXT: srli a6, a2, 4
+; RV32IM-NEXT: and a2, a2, s1
+; RV32IM-NEXT: and a6, a6, s1
+; RV32IM-NEXT: slli a2, a2, 4
+; RV32IM-NEXT: or a2, a6, a2
+; RV32IM-NEXT: srli a6, a0, 4
+; RV32IM-NEXT: and a0, a0, s1
+; RV32IM-NEXT: and a6, a6, s1
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: or a0, a6, a0
+; RV32IM-NEXT: srli a6, a2, 2
+; RV32IM-NEXT: and a2, a2, s3
+; RV32IM-NEXT: and a6, a6, s3
+; RV32IM-NEXT: slli a2, a2, 2
+; RV32IM-NEXT: or a2, a6, a2
+; RV32IM-NEXT: srli a6, a0, 2
+; RV32IM-NEXT: and a0, a0, s3
+; RV32IM-NEXT: and a6, a6, s3
+; RV32IM-NEXT: slli a0, a0, 2
+; RV32IM-NEXT: or a0, a6, a0
+; RV32IM-NEXT: srli a6, a2, 1
+; RV32IM-NEXT: and a2, a2, a4
+; RV32IM-NEXT: and a6, a6, a4
+; RV32IM-NEXT: slli a2, a2, 1
+; RV32IM-NEXT: or a6, a6, a2
+; RV32IM-NEXT: srli a2, a0, 1
+; RV32IM-NEXT: and a0, a0, a4
+; RV32IM-NEXT: and a2, a2, a4
+; RV32IM-NEXT: slli a0, a0, 1
+; RV32IM-NEXT: or a0, a2, a0
+; RV32IM-NEXT: lui a2, 524288
+; RV32IM-NEXT: slli t3, t3, 11
+; RV32IM-NEXT: and t3, a0, t3
+; RV32IM-NEXT: lui a4, 1
+; RV32IM-NEXT: and t4, a0, a4
+; RV32IM-NEXT: and s11, a0, s11
+; RV32IM-NEXT: and a4, a0, t2
+; RV32IM-NEXT: sw a4, 80(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, s10
+; RV32IM-NEXT: sw a4, 72(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a5, a0, a5
+; RV32IM-NEXT: sw a5, 68(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, t5
+; RV32IM-NEXT: sw a4, 64(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s0, a0, s0
+; RV32IM-NEXT: and a4, a0, s2
+; RV32IM-NEXT: sw a4, 60(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and s4, a0, s4
+; RV32IM-NEXT: and a4, a0, s5
+; RV32IM-NEXT: sw a4, 56(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, s6
+; RV32IM-NEXT: sw a4, 52(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, s7
+; RV32IM-NEXT: sw a4, 48(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, s8
+; RV32IM-NEXT: sw a4, 44(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, s9
+; RV32IM-NEXT: sw a4, 40(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, ra
+; RV32IM-NEXT: sw a4, 36(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, a7
+; RV32IM-NEXT: sw a4, 32(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a4, a0, t0
+; RV32IM-NEXT: sw a4, 28(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a3, a0, a3
+; RV32IM-NEXT: sw a3, 24(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a1, a0, a1
+; RV32IM-NEXT: sw a1, 20(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: and a2, a0, a2
+; RV32IM-NEXT: sw a2, 16(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: andi ra, a0, 2
+; RV32IM-NEXT: andi a1, a0, 1
+; RV32IM-NEXT: andi a2, a0, 4
+; RV32IM-NEXT: andi a3, a0, 8
+; RV32IM-NEXT: andi a4, a0, 16
+; RV32IM-NEXT: andi a5, a0, 32
+; RV32IM-NEXT: andi a7, a0, 64
+; RV32IM-NEXT: andi t0, a0, 128
+; RV32IM-NEXT: andi t1, a0, 256
+; RV32IM-NEXT: andi t2, a0, 512
+; RV32IM-NEXT: andi a0, a0, 1024
+; RV32IM-NEXT: mul ra, a6, ra
+; RV32IM-NEXT: mul s10, a6, a1
+; RV32IM-NEXT: mul s9, a6, a2
+; RV32IM-NEXT: mul s5, a6, a3
+; RV32IM-NEXT: mul s6, a6, a4
+; RV32IM-NEXT: mul s2, a6, a5
+; RV32IM-NEXT: mul a1, a6, a7
+; RV32IM-NEXT: sw a1, 4(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a1, a6, t0
+; RV32IM-NEXT: sw a1, 76(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul t6, a6, t1
+; RV32IM-NEXT: mul t2, a6, t2
+; RV32IM-NEXT: mul s7, a6, a0
+; RV32IM-NEXT: mul a0, a6, t3
+; RV32IM-NEXT: sw a0, 8(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a6, t4
+; RV32IM-NEXT: sw a0, 12(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul t1, a6, s11
+; RV32IM-NEXT: lw a0, 80(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a7, a6, a0
+; RV32IM-NEXT: lw a0, 72(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t5, a6, a0
+; RV32IM-NEXT: lw a0, 68(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s8, a6, a0
+; RV32IM-NEXT: lw a0, 64(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a6, a0
+; RV32IM-NEXT: sw a0, 68(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: mul a0, a6, s0
+; RV32IM-NEXT: sw a0, 72(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw a0, 60(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a3, a6, a0
+; RV32IM-NEXT: mul a2, a6, s4
+; RV32IM-NEXT: lw a0, 56(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a5, a6, a0
+; RV32IM-NEXT: lw a0, 52(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t3, a6, a0
+; RV32IM-NEXT: lw a0, 48(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s4, a6, a0
+; RV32IM-NEXT: lw a0, 44(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a1, a6, a0
+; RV32IM-NEXT: lw a0, 40(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a0, a6, a0
+; RV32IM-NEXT: lw a4, 36(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a4, a6, a4
+; RV32IM-NEXT: lw t0, 32(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t0, a6, t0
+; RV32IM-NEXT: lw t4, 28(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul t4, a6, t4
+; RV32IM-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s0, a6, s0
+; RV32IM-NEXT: lw s11, 20(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul s11, a6, s11
+; RV32IM-NEXT: sw s11, 80(sp) # 4-byte Folded Spill
+; RV32IM-NEXT: lw s11, 16(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: mul a6, a6, s11
+; RV32IM-NEXT: xor s10, s10, ra
+; RV32IM-NEXT: xor s5, s9, s5
+; RV32IM-NEXT: xor s2, s6, s2
+; RV32IM-NEXT: xor t2, t6, t2
+; RV32IM-NEXT: xor a7, t1, a7
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a1, a0
+; RV32IM-NEXT: xor a1, s10, s5
+; RV32IM-NEXT: lw a3, 4(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, s2, a3
+; RV32IM-NEXT: xor t1, t2, s7
+; RV32IM-NEXT: xor a7, a7, t5
+; RV32IM-NEXT: xor a2, a2, a5
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: xor a1, a1, a3
+; RV32IM-NEXT: lw a3, 8(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, t1, a3
+; RV32IM-NEXT: xor a4, a7, s8
+; RV32IM-NEXT: xor a2, a2, t3
+; RV32IM-NEXT: xor a0, a0, t0
+; RV32IM-NEXT: lw a5, 76(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a1, a1, a5
+; RV32IM-NEXT: lw a5, 12(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a3, a3, a5
+; RV32IM-NEXT: lw a5, 68(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a2, a2, s4
+; RV32IM-NEXT: xor a0, a0, t4
+; RV32IM-NEXT: lw a5, 72(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a4, a4, a5
+; RV32IM-NEXT: xor a0, a0, s0
+; RV32IM-NEXT: lui a5, 349525
+; RV32IM-NEXT: addi a5, a5, 1364
+; RV32IM-NEXT: xor a3, a1, a3
+; RV32IM-NEXT: slli a1, a1, 24
+; RV32IM-NEXT: xor a3, a3, a4
+; RV32IM-NEXT: lw a4, 80(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: xor a0, a0, a4
+; RV32IM-NEXT: xor a2, a3, a2
+; RV32IM-NEXT: xor a0, a0, a6
+; RV32IM-NEXT: lw a6, 88(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: and a3, a2, a6
+; RV32IM-NEXT: srli a4, a2, 8
+; RV32IM-NEXT: xor a0, a2, a0
+; RV32IM-NEXT: slli a3, a3, 8
+; RV32IM-NEXT: and a2, a4, a6
+; RV32IM-NEXT: srli a0, a0, 24
+; RV32IM-NEXT: or a1, a1, a3
+; RV32IM-NEXT: or a0, a2, a0
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 4
+; RV32IM-NEXT: and a0, a0, s1
+; RV32IM-NEXT: and a1, a1, s1
+; RV32IM-NEXT: slli a0, a0, 4
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 2
+; RV32IM-NEXT: and a0, a0, s3
+; RV32IM-NEXT: and a1, a1, s3
+; RV32IM-NEXT: slli a0, a0, 2
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a1, a0, 1
+; RV32IM-NEXT: lw a2, 84(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: and a0, a0, a2
+; RV32IM-NEXT: and a1, a1, a5
+; RV32IM-NEXT: slli a0, a0, 1
+; RV32IM-NEXT: or a0, a1, a0
+; RV32IM-NEXT: srli a0, a0, 1
+; RV32IM-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s2, 128(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s3, 124(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s4, 120(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s5, 116(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s6, 112(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s7, 108(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s8, 104(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s9, 100(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s10, 96(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: lw s11, 92(sp) # 4-byte Folded Reload
+; RV32IM-NEXT: addi sp, sp, 144
+; RV32IM-NEXT: ret
+;
+; RV64IM-LABEL: clmulh_i32:
+; RV64IM: # %bb.0:
+; RV64IM-NEXT: addi sp, sp, -128
+; RV64IM-NEXT: sd ra, 120(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s0, 112(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s1, 104(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s2, 96(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s3, 88(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s4, 80(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s5, 72(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s6, 64(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s7, 56(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s8, 48(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s9, 40(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s10, 32(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: sd s11, 24(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: slli a6, a0, 32
+; RV64IM-NEXT: andi t1, a1, 2
+; RV64IM-NEXT: andi t3, a1, 1
+; RV64IM-NEXT: andi a5, a1, 4
+; RV64IM-NEXT: andi a7, a1, 8
+; RV64IM-NEXT: andi a3, a1, 16
+; RV64IM-NEXT: andi a4, a1, 32
+; RV64IM-NEXT: andi a0, a1, 64
+; RV64IM-NEXT: sd a0, 16(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: andi t0, a1, 128
+; RV64IM-NEXT: andi t2, a1, 256
+; RV64IM-NEXT: andi a0, a1, 512
+; RV64IM-NEXT: sd a0, 8(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: li a2, 1
+; RV64IM-NEXT: lui t5, 1
+; RV64IM-NEXT: lui t6, 2
+; RV64IM-NEXT: lui s0, 4
+; RV64IM-NEXT: lui s2, 8
+; RV64IM-NEXT: lui s3, 16
+; RV64IM-NEXT: lui s4, 32
+; RV64IM-NEXT: lui s5, 64
+; RV64IM-NEXT: lui s6, 128
+; RV64IM-NEXT: lui s7, 256
+; RV64IM-NEXT: lui s8, 512
+; RV64IM-NEXT: lui s9, 1024
+; RV64IM-NEXT: lui s10, 2048
+; RV64IM-NEXT: lui s11, 4096
+; RV64IM-NEXT: lui ra, 8192
+; RV64IM-NEXT: lui a0, 16384
+; RV64IM-NEXT: srli s1, a6, 32
+; RV64IM-NEXT: mul a6, s1, t1
+; RV64IM-NEXT: mul t1, s1, t3
+; RV64IM-NEXT: xor a6, t1, a6
+; RV64IM-NEXT: sd a6, 0(sp) # 8-byte Folded Spill
+; RV64IM-NEXT: lui t1, 32768
+; RV64IM-NEXT: mul a5, s1, a5
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: xor t4, a5, a7
+; RV64IM-NEXT: lui a7, 65536
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: xor a6, a3, a4
+; RV64IM-NEXT: lui t3, 131072
+; RV64IM-NEXT: mul a4, s1, t0
+; RV64IM-NEXT: mul t0, s1, t2
+; RV64IM-NEXT: xor a5, a4, t0
+; RV64IM-NEXT: lui t0, 262144
+; RV64IM-NEXT: slli t2, a2, 11
+; RV64IM-NEXT: and t5, a1, t5
+; RV64IM-NEXT: and t6, a1, t6
+; RV64IM-NEXT: and s0, a1, s0
+; RV64IM-NEXT: and s2, a1, s2
+; RV64IM-NEXT: and s3, a1, s3
+; RV64IM-NEXT: and s4, a1, s4
+; RV64IM-NEXT: and s5, a1, s5
+; RV64IM-NEXT: and s6, a1, s6
+; RV64IM-NEXT: and s7, a1, s7
+; RV64IM-NEXT: and s8, a1, s8
+; RV64IM-NEXT: and s9, a1, s9
+; RV64IM-NEXT: and s10, a1, s10
+; RV64IM-NEXT: and s11, a1, s11
+; RV64IM-NEXT: and ra, a1, ra
+; RV64IM-NEXT: and a2, a1, a0
+; RV64IM-NEXT: and t1, a1, t1
+; RV64IM-NEXT: and a7, a1, a7
+; RV64IM-NEXT: and t3, a1, t3
+; RV64IM-NEXT: and t0, a1, t0
+; RV64IM-NEXT: and t2, a1, t2
+; RV64IM-NEXT: andi a0, a1, 1024
+; RV64IM-NEXT: srliw a1, a1, 31
+; RV64IM-NEXT: slli a1, a1, 31
+; RV64IM-NEXT: ld a3, 16(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a3, s1, a3
+; RV64IM-NEXT: ld a4, 8(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: mul a4, s1, a4
+; RV64IM-NEXT: mul a0, s1, a0
+; RV64IM-NEXT: mul t5, s1, t5
+; RV64IM-NEXT: mul t6, s1, t6
+; RV64IM-NEXT: mul s0, s1, s0
+; RV64IM-NEXT: mul s2, s1, s2
+; RV64IM-NEXT: mul s3, s1, s3
+; RV64IM-NEXT: mul s4, s1, s4
+; RV64IM-NEXT: mul s5, s1, s5
+; RV64IM-NEXT: mul s6, s1, s6
+; RV64IM-NEXT: mul s7, s1, s7
+; RV64IM-NEXT: mul s8, s1, s8
+; RV64IM-NEXT: mul s9, s1, s9
+; RV64IM-NEXT: mul s10, s1, s10
+; RV64IM-NEXT: mul s11, s1, s11
+; RV64IM-NEXT: mul ra, s1, ra
+; RV64IM-NEXT: mul a2, s1, a2
+; RV64IM-NEXT: mul t1, s1, t1
+; RV64IM-NEXT: mul a7, s1, a7
+; RV64IM-NEXT: mul t3, s1, t3
+; RV64IM-NEXT: mul t0, s1, t0
+; RV64IM-NEXT: mul a1, s1, a1
+; RV64IM-NEXT: mul t2, s1, t2
+; RV64IM-NEXT: xor s1, s2, s3
+; RV64IM-NEXT: xor s2, s8, s9
+; RV64IM-NEXT: xor a7, a7, t3
+; RV64IM-NEXT: ld t3, 0(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: xor t3, t3, t4
+; RV64IM-NEXT: xor a3, a6, a3
+; RV64IM-NEXT: xor a4, a5, a4
+; RV64IM-NEXT: xor a5, t2, t5
+; RV64IM-NEXT: xor a6, s1, s4
+; RV64IM-NEXT: xor t2, s2, s10
+; RV64IM-NEXT: xor a7, a7, t0
+; RV64IM-NEXT: xor a3, t3, a3
+; RV64IM-NEXT: xor a0, a4, a0
+; RV64IM-NEXT: xor a4, a5, t6
+; RV64IM-NEXT: xor a5, a6, s5
+; RV64IM-NEXT: xor a6, t2, s11
+; RV64IM-NEXT: xor a0, a3, a0
+; RV64IM-NEXT: xor a4, a4, s0
+; RV64IM-NEXT: xor a3, a5, s6
+; RV64IM-NEXT: xor a5, a6, ra
+; RV64IM-NEXT: xor a0, a0, a4
+; RV64IM-NEXT: xor a3, a3, s7
+; RV64IM-NEXT: xor a2, a5, a2
+; RV64IM-NEXT: xor a0, a0, a3
+; RV64IM-NEXT: xor a2, a2, t1
+; RV64IM-NEXT: xor a0, a0, a2
+; RV64IM-NEXT: xor a1, a7, a1
+; RV64IM-NEXT: xor a0, a0, a1
+; RV64IM-NEXT: srli a0, a0, 32
+; RV64IM-NEXT: ld ra, 120(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s0, 112(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s1, 104(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s2, 96(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s3, 88(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s4, 80(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s5, 72(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s6, 64(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s7, 56(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s8, 48(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s9, 40(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s10, 32(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: ld s11, 24(sp) # 8-byte Folded Reload
+; RV64IM-NEXT: addi sp, sp, 128
+; RV64IM-NEXT: ret
+ %a.ext = zext i32 %a to i64
+ %b.ext = zext i32 %b to i64
+ %clmul = call i64 @llvm.clmul.i64(i64 %a.ext, i64 %b.ext)
+ %res.ext = lshr i64 %clmul, 32
+ %res = trunc i64 %res.ext to i32
+ ret i32 %res
+}
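
The clmulh_i32 test above checks the "high half" pattern: the inputs are zero-extended to i64, carry-less multiplied, and the product shifted right by 32. On RV64IM this expands directly over 64 bits followed by a final srli of 32; the RV32IM block instead follows the classic bit-reversal identity clmulh(a, b) = bitreverse(clmul(bitreverse(a), bitreverse(b))) >> 1, which is why it opens and closes with the byte/nibble/bit swap sequences (the 61681, 209715, 349525 constants) and ends with a shift right by 1. A minimal C reference for the semantics being checked, assuming only the shift-and-XOR definition of carry-less multiplication (ref_clmulh_i32 is an illustrative name, not an LLVM API):

    #include <stdint.h>

    /* High 32 bits of the 64-bit carry-less product of a and b. */
    static uint32_t ref_clmulh_i32(uint32_t a, uint32_t b) {
      uint64_t acc = 0;
      for (int i = 0; i < 32; i++)
        if ((b >> i) & 1)          /* for each set bit of b ...      */
          acc ^= (uint64_t)a << i; /* ... XOR in a shifted copy of a */
      return (uint32_t)(acc >> 32);
    }
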
diff --git a/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
new file mode 100644
index 0000000000000..5c017fe72886c
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/clmul-sdnode.ll
@@ -0,0 +1,5742 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
+define <vscale x 1 x i32> @clmul_nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_nxv1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 1 x i32> @llvm.clmul.nxv1i32(<vscale x 1 x i32> %x, <vscale x 1 x i32> %y)
+ ret <vscale x 1 x i32> %a
+}
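
As the CHECK lines above show, the generic expansion materializes clmul one bit position at a time: it masks the second operand with (1 << i), multiplies (an ordinary multiply by a single set bit is just a shift), and XORs the partial products together, giving 32 vand/vmul/vxor steps for an i32 element type. A scalar C sketch of the same identity (expand_clmul_i32 is an illustrative name, not part of this patch):

    #include <stdint.h>

    /* XOR of partial products x * (y & (1 << i)); multiplying by a
       single set bit is equivalent to shifting x left by i. */
    static uint32_t expand_clmul_i32(uint32_t x, uint32_t y) {
      uint32_t acc = 0;
      for (int i = 0; i < 32; i++)
        acc ^= x * (y & (UINT32_C(1) << i));
      return acc;
    }

The nxv2i32 through nxv16i32 tests below repeat the same 32-step chain at larger register-group sizes (m1 through m8, versus mf2 here).
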
+
+define <vscale x 2 x i32> @clmul_nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_nxv2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 2 x i32> @llvm.clmul.nxv2i32(<vscale x 2 x i32> %x, <vscale x 2 x i32> %y)
+ ret <vscale x 2 x i32> %a
+}
+
+define <vscale x 4 x i32> @clmul_nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vand.vi v12, v10, 2
+; CHECK-NEXT: vand.vi v14, v10, 1
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v14, v12
+; CHECK-NEXT: vand.vi v14, v10, 4
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vi v14, v10, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vand.vx v14, v10, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v10, v10, a0
+; CHECK-NEXT: vmul.vv v14, v8, v14
+; CHECK-NEXT: vxor.vv v12, v12, v14
+; CHECK-NEXT: vmul.vv v8, v8, v10
+; CHECK-NEXT: vxor.vv v8, v12, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 4 x i32> @llvm.clmul.nxv4i32(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y)
+ ret <vscale x 4 x i32> %a
+}
+
+define <vscale x 8 x i32> @clmul_nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_nxv8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; CHECK-NEXT: vand.vi v12, v8, 2
+; CHECK-NEXT: vand.vi v16, v8, 1
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v16, v12
+; CHECK-NEXT: vand.vi v16, v8, 4
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vi v16, v8, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vxor.vv v12, v12, v16
+; CHECK-NEXT: vand.vx v16, v8, a0
+; CHECK-NEXT: vmul.vv v8, v8, v16
+; CHECK-NEXT: vxor.vv v8, v12, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 8 x i32> @llvm.clmul.nxv8i32(<vscale x 8 x i32> %x, <vscale x 8 x i32> %x)
+ ret <vscale x 8 x i32> %a
+}
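
(Note that clmul_nxv8i32 above passes %x as both operands, a carry-less square, which is why its vand lines mask v8, the first input's register group, rather than v12.)
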
+
+define <vscale x 16 x i32> @clmul_nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
+; CHECK-NEXT: vand.vi v24, v16, 2
+; CHECK-NEXT: vand.vi v0, v16, 1
+; CHECK-NEXT: vmul.vv v24, v8, v24
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v0, v24
+; CHECK-NEXT: vand.vi v0, v16, 4
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vi v0, v16, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vand.vx v0, v16, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v16, v16, a0
+; CHECK-NEXT: vmul.vv v0, v8, v0
+; CHECK-NEXT: vxor.vv v24, v24, v0
+; CHECK-NEXT: vmul.vv v8, v8, v16
+; CHECK-NEXT: vxor.vv v8, v24, v8
+; CHECK-NEXT: ret
+ %a = call <vscale x 16 x i32> @llvm.clmul.nxv16i32(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y)
+ ret <vscale x 16 x i32> %a
+}
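
For the nxv1i64 test that follows, RV32 cannot hold a 64-bit bit mask in a scalar register, so each mask that does not fit a sign-extended 32-bit immediate is materialized as a pair of 32-bit stores to the stack and splatted with a stride-zero vlse64.v; the resulting register pressure also forces several vector spills and reloads (vs1r.v/vl1r.v) around the multiply chain. The RV64 side keeps all 64 masks in scalar registers and follows the same per-bit expansion, now with 64 partial products.
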
+
+define <vscale x 1 x i64> @clmul_nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %y) nounwind {
+; RV32-LABEL: clmul_nxv1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li t5, 1
+; RV32-NEXT: li a4, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s11, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li ra, 32
+; RV32-NEXT: li s10, 64
+; RV32-NEXT: li s9, 128
+; RV32-NEXT: li s8, 256
+; RV32-NEXT: li s7, 512
+; RV32-NEXT: li s1, 1024
+; RV32-NEXT: lui s6, 1
+; RV32-NEXT: lui s5, 2
+; RV32-NEXT: lui s4, 4
+; RV32-NEXT: lui s3, 8
+; RV32-NEXT: lui s2, 16
+; RV32-NEXT: lui s0, 32
+; RV32-NEXT: lui t6, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a3, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw t5, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a4, 260(sp)
+; RV32-NEXT: lui a4, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s11, 244(sp)
+; RV32-NEXT: vsetvli s11, zero, e64, m1, ta, ma
+; RV32-NEXT: vand.vi v13, v9, 2
+; RV32-NEXT: vand.vi v14, v9, 1
+; RV32-NEXT: vand.vi v12, v9, 4
+; RV32-NEXT: vand.vi v11, v9, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vand.vx v10, v9, a0
+; RV32-NEXT: addi s11, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw ra, 228(sp)
+; RV32-NEXT: vand.vx v15, v9, ra
+; RV32-NEXT: addi ra, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s10, 220(sp)
+; RV32-NEXT: vand.vx v16, v9, s10
+; RV32-NEXT: addi s10, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s9, 212(sp)
+; RV32-NEXT: vand.vx v17, v9, s9
+; RV32-NEXT: addi s9, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s8, 204(sp)
+; RV32-NEXT: vand.vx v18, v9, s8
+; RV32-NEXT: addi s8, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s7, 196(sp)
+; RV32-NEXT: vand.vx v19, v9, s7
+; RV32-NEXT: addi s7, sp, 232
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s1, 188(sp)
+; RV32-NEXT: vand.vx v20, v9, s1
+; RV32-NEXT: slli t5, t5, 11
+; RV32-NEXT: vand.vx v21, v9, s6
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw t5, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw s6, 172(sp)
+; RV32-NEXT: addi s6, sp, 216
+; RV32-NEXT: vand.vx v22, v9, s5
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s5, 164(sp)
+; RV32-NEXT: addi s5, sp, 208
+; RV32-NEXT: vand.vx v23, v9, s4
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s4, 156(sp)
+; RV32-NEXT: addi s4, sp, 200
+; RV32-NEXT: vand.vx v24, v9, s3
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s3, 148(sp)
+; RV32-NEXT: addi s3, sp, 192
+; RV32-NEXT: vand.vx v25, v9, s2
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s2, 140(sp)
+; RV32-NEXT: addi s2, sp, 184
+; RV32-NEXT: vand.vx v26, v9, s0
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s0, 132(sp)
+; RV32-NEXT: addi s1, sp, 176
+; RV32-NEXT: vand.vx v27, v9, t6
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t6, 124(sp)
+; RV32-NEXT: addi s0, sp, 168
+; RV32-NEXT: vand.vx v28, v9, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t6, sp, 160
+; RV32-NEXT: vand.vx v29, v9, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t4, sp, 152
+; RV32-NEXT: vand.vx v30, v9, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t3, sp, 144
+; RV32-NEXT: vand.vx v31, v9, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t2, sp, 136
+; RV32-NEXT: vand.vx v7, v9, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t1, sp, 128
+; RV32-NEXT: vand.vx v6, v9, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t0, sp, 120
+; RV32-NEXT: vand.vx v5, v9, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a7, sp, 112
+; RV32-NEXT: vand.vx v4, v9, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a6, sp, 104
+; RV32-NEXT: vand.vx v3, v9, a3
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a3, 52(sp)
+; RV32-NEXT: addi a5, sp, 96
+; RV32-NEXT: vand.vx v2, v9, a4
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a4, 44(sp)
+; RV32-NEXT: addi a4, sp, 88
+; RV32-NEXT: vand.vx v1, v9, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a3, sp, 80
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: vand.vx v0, v9, t5
+; RV32-NEXT: addi a1, sp, 64
+; RV32-NEXT: vmul.vv v13, v8, v13
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vi v14, v14, 0
+; RV32-NEXT: vxor.vv v14, v14, v13
+; RV32-NEXT: vlse64.v v13, (s11), zero
+; RV32-NEXT: addi s11, sp, 56
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v14, v14, v12
+; RV32-NEXT: vlse64.v v12, (ra), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv ra, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, ra
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi ra, sp, 48
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v14, v14, v11
+; RV32-NEXT: vlse64.v v11, (s10), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli s10, t5, 2
+; RV32-NEXT: add t5, s10, t5
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s10, sp, 40
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v14, v14, v10
+; RV32-NEXT: vlse64.v v10, (s9), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi t5, sp, 32
+; RV32-NEXT: vmul.vv v15, v8, v15
+; RV32-NEXT: vxor.vv v15, v14, v15
+; RV32-NEXT: vlse64.v v10, (s8), zero
+; RV32-NEXT: csrr s8, vlenb
+; RV32-NEXT: slli s9, s8, 1
+; RV32-NEXT: add s8, s9, s8
+; RV32-NEXT: add s8, sp, s8
+; RV32-NEXT: addi s8, s8, 288
+; RV32-NEXT: vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s8, sp, 24
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v15, v16
+; RV32-NEXT: vlse64.v v10, (s7), zero
+; RV32-NEXT: csrr s7, vlenb
+; RV32-NEXT: slli s7, s7, 1
+; RV32-NEXT: add s7, sp, s7
+; RV32-NEXT: addi s7, s7, 288
+; RV32-NEXT: vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 16
+; RV32-NEXT: vmul.vv v17, v8, v17
+; RV32-NEXT: vmul.vv v18, v8, v18
+; RV32-NEXT: vmul.vv v19, v8, v19
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vmul.vv v21, v8, v21
+; RV32-NEXT: vmul.vv v22, v8, v22
+; RV32-NEXT: vmul.vv v23, v8, v23
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v25, v8, v25
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vmul.vv v27, v8, v27
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v29, v8, v29
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vmul.vv v31, v8, v31
+; RV32-NEXT: vmul.vv v7, v8, v7
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vmul.vv v5, v8, v5
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vmul.vv v3, v8, v3
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vmul.vv v1, v8, v1
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v16, v16, v17
+; RV32-NEXT: addi s9, sp, 224
+; RV32-NEXT: vlse64.v v11, (s9), zero
+; RV32-NEXT: vxor.vv v16, v16, v18
+; RV32-NEXT: vlse64.v v10, (s6), zero
+; RV32-NEXT: csrr s6, vlenb
+; RV32-NEXT: add s6, sp, s6
+; RV32-NEXT: addi s6, s6, 288
+; RV32-NEXT: vs1r.v v10, (s6) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v19
+; RV32-NEXT: vlse64.v v10, (s5), zero
+; RV32-NEXT: addi s5, sp, 288
+; RV32-NEXT: vs1r.v v10, (s5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v20
+; RV32-NEXT: vlse64.v v12, (s4), zero
+; RV32-NEXT: vxor.vv v16, v16, v0
+; RV32-NEXT: vlse64.v v0, (s3), zero
+; RV32-NEXT: vxor.vv v16, v16, v21
+; RV32-NEXT: vlse64.v v21, (s2), zero
+; RV32-NEXT: vxor.vv v16, v16, v22
+; RV32-NEXT: vlse64.v v22, (s1), zero
+; RV32-NEXT: vxor.vv v16, v16, v23
+; RV32-NEXT: vlse64.v v23, (s0), zero
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: vlse64.v v24, (t6), zero
+; RV32-NEXT: vxor.vv v16, v16, v25
+; RV32-NEXT: vlse64.v v25, (t4), zero
+; RV32-NEXT: vxor.vv v16, v16, v26
+; RV32-NEXT: vlse64.v v26, (t3), zero
+; RV32-NEXT: vxor.vv v16, v16, v27
+; RV32-NEXT: vlse64.v v27, (t2), zero
+; RV32-NEXT: vxor.vv v16, v16, v28
+; RV32-NEXT: vlse64.v v28, (t1), zero
+; RV32-NEXT: vxor.vv v16, v16, v29
+; RV32-NEXT: vlse64.v v29, (t0), zero
+; RV32-NEXT: vxor.vv v16, v16, v30
+; RV32-NEXT: vlse64.v v30, (a7), zero
+; RV32-NEXT: vxor.vv v16, v16, v31
+; RV32-NEXT: vlse64.v v31, (a6), zero
+; RV32-NEXT: vxor.vv v16, v16, v7
+; RV32-NEXT: vlse64.v v7, (a5), zero
+; RV32-NEXT: vxor.vv v16, v16, v6
+; RV32-NEXT: vlse64.v v6, (a4), zero
+; RV32-NEXT: vxor.vv v16, v16, v5
+; RV32-NEXT: vlse64.v v5, (a3), zero
+; RV32-NEXT: vxor.vv v16, v16, v4
+; RV32-NEXT: vlse64.v v4, (a2), zero
+; RV32-NEXT: vxor.vv v16, v16, v3
+; RV32-NEXT: vlse64.v v3, (a1), zero
+; RV32-NEXT: vxor.vv v16, v16, v2
+; RV32-NEXT: vlse64.v v2, (s11), zero
+; RV32-NEXT: vxor.vv v1, v16, v1
+; RV32-NEXT: vlse64.v v10, (ra), zero
+; RV32-NEXT: vand.vv v13, v9, v13
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v14, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v14, v9, v14
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v15, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v15, v9, v15
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v16, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v9, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v17, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v17, v9, v17
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v18, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v18, v9, v18
+; RV32-NEXT: vand.vv v19, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v20, v9, v11
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v11, v9, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v0, v9, v0
+; RV32-NEXT: vand.vv v21, v9, v21
+; RV32-NEXT: vand.vv v22, v9, v22
+; RV32-NEXT: vand.vv v23, v9, v23
+; RV32-NEXT: vand.vv v24, v9, v24
+; RV32-NEXT: vand.vv v25, v9, v25
+; RV32-NEXT: vand.vv v26, v9, v26
+; RV32-NEXT: vand.vv v27, v9, v27
+; RV32-NEXT: vand.vv v28, v9, v28
+; RV32-NEXT: vand.vv v29, v9, v29
+; RV32-NEXT: vand.vv v30, v9, v30
+; RV32-NEXT: vand.vv v31, v9, v31
+; RV32-NEXT: vand.vv v7, v9, v7
+; RV32-NEXT: vand.vv v6, v9, v6
+; RV32-NEXT: vand.vv v5, v9, v5
+; RV32-NEXT: vand.vv v4, v9, v4
+; RV32-NEXT: vand.vv v11, v9, v3
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v2, v9, v2
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vlse64.v v10, (s10), zero
+; RV32-NEXT: vlse64.v v3, (t5), zero
+; RV32-NEXT: vlse64.v v11, (s8), zero
+; RV32-NEXT: vlse64.v v12, (s7), zero
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: vand.vv v3, v9, v3
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v9, v12
+; RV32-NEXT: vand.vx v9, v9, a0
+; RV32-NEXT: vmul.vv v9, v8, v9
+; RV32-NEXT: vxor.vv v9, v1, v9
+; RV32-NEXT: vmul.vv v11, v8, v13
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v14
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v15
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v16
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v17
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v18
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v19
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v20
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v0
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v21
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v22
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v23
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v24
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v25
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v26
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v27
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v28
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v29
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v30
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v31
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v7
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v6
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v5
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v4
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v2
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v10, v8, v3
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 2
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v8, v8, v12
+; RV32-NEXT: vxor.vv v8, v9, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_nxv1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vand.vi v10, v9, 2
+; RV64-NEXT: vand.vi v11, v9, 1
+; RV64-NEXT: vmul.vv v10, v8, v10
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v11, v10
+; RV64-NEXT: vand.vi v11, v9, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vi v11, v9, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vmul.vv v8, v8, v9
+; RV64-NEXT: vxor.vv v8, v10, v8
+; RV64-NEXT: ret
+ %a = call <vscale x 1 x i64> @llvm.clmul.nxv1i64(<vscale x 1 x i64> %x, <vscale x 1 x i64> %y)
+ ret <vscale x 1 x i64> %a
+}
+
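The RV64 body above shows the generic expansion in full: with no RISC-V-specific lowering for the new node yet, the legalizer isolates each bit of the second operand (vand.vi/vand.vx), multiplies it with the first operand (vmul.vv, which reduces to a shift when the bit is set), and folds the partial products together with vxor.vv. A minimal C sketch of the semantics those CHECK lines exercise, assuming a hypothetical helper name (clmul64 is illustration only, not code from the patch):

    #include <stdint.h>

    /* Mirrors the vand/vmul/vxor sequence above: a * (b & (1 << i)) is
       a << i when bit i of b is set and 0 otherwise, and the partial
       products are combined with XOR rather than ADD. */
    uint64_t clmul64(uint64_t a, uint64_t b) {
      uint64_t r = 0;
      for (unsigned i = 0; i < 64; ++i)
        r ^= a * (b & ((uint64_t)1 << i));
      return r;
    }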
+define <vscale x 2 x i64> @clmul_nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) nounwind {
+; RV32-LABEL: clmul_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li s2, 1
+; RV32-NEXT: li a3, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s7, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li s6, 32
+; RV32-NEXT: li s5, 64
+; RV32-NEXT: li s4, 128
+; RV32-NEXT: li s1, 256
+; RV32-NEXT: li s0, 512
+; RV32-NEXT: li t5, 1024
+; RV32-NEXT: lui ra, 1
+; RV32-NEXT: lui s8, 2
+; RV32-NEXT: lui s10, 4
+; RV32-NEXT: lui s11, 8
+; RV32-NEXT: lui s9, 16
+; RV32-NEXT: lui s3, 32
+; RV32-NEXT: lui t6, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a4, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw s2, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a3, 260(sp)
+; RV32-NEXT: lui a3, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s7, 244(sp)
+; RV32-NEXT: vsetvli s7, zero, e64, m2, ta, ma
+; RV32-NEXT: vand.vi v28, v10, 2
+; RV32-NEXT: vand.vi v20, v10, 1
+; RV32-NEXT: vand.vi v30, v10, 4
+; RV32-NEXT: vand.vi v14, v10, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vand.vx v12, v10, a0
+; RV32-NEXT: addi s7, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw s6, 228(sp)
+; RV32-NEXT: vand.vx v16, v10, s6
+; RV32-NEXT: addi s6, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s5, 220(sp)
+; RV32-NEXT: vand.vx v18, v10, s5
+; RV32-NEXT: addi s5, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s4, 212(sp)
+; RV32-NEXT: vand.vx v0, v10, s4
+; RV32-NEXT: addi s4, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s1, 204(sp)
+; RV32-NEXT: vand.vx v6, v10, s1
+; RV32-NEXT: addi s1, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s0, 196(sp)
+; RV32-NEXT: vand.vx v4, v10, s0
+; RV32-NEXT: addi s0, sp, 232
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw t5, 188(sp)
+; RV32-NEXT: vand.vx v2, v10, t5
+; RV32-NEXT: slli s2, s2, 11
+; RV32-NEXT: vand.vx v24, v10, ra
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw s2, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw ra, 172(sp)
+; RV32-NEXT: addi t5, sp, 216
+; RV32-NEXT: vand.vx v26, v10, s8
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s8, 164(sp)
+; RV32-NEXT: addi s8, sp, 208
+; RV32-NEXT: vand.vx v22, v10, s10
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s10, 156(sp)
+; RV32-NEXT: addi s10, sp, 200
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vi v20, v20, 0
+; RV32-NEXT: vxor.vv v20, v20, v28
+; RV32-NEXT: vand.vx v28, v10, s11
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s11, 148(sp)
+; RV32-NEXT: addi s11, sp, 192
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v20, v20, v30
+; RV32-NEXT: vand.vx v30, v10, s9
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s9, 140(sp)
+; RV32-NEXT: addi s9, sp, 184
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vv v14, v20, v14
+; RV32-NEXT: vand.vx v20, v10, s3
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, ra
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs2r.v v20, (a0) # vscale x 16-byte Folded Spill
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s3, 132(sp)
+; RV32-NEXT: addi s3, sp, 176
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v12, v14, v12
+; RV32-NEXT: vand.vx v14, v10, t6
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t6, 124(sp)
+; RV32-NEXT: addi t6, sp, 168
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v10, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t4, sp, 160
+; RV32-NEXT: vmul.vv v18, v8, v18
+; RV32-NEXT: vxor.vv v18, v12, v18
+; RV32-NEXT: vand.vx v12, v10, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t3, sp, 152
+; RV32-NEXT: vmul.vv v20, v8, v0
+; RV32-NEXT: vxor.vv v18, v18, v20
+; RV32-NEXT: vand.vx v20, v10, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t2, sp, 144
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vxor.vv v18, v18, v6
+; RV32-NEXT: vand.vx v6, v10, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t1, sp, 136
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v18, v18, v4
+; RV32-NEXT: vand.vx v4, v10, t0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add ra, ra, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, ra
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t0, sp, 128
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vxor.vv v18, v18, v2
+; RV32-NEXT: vand.vx v2, v10, s2
+; RV32-NEXT: addi ra, sp, 120
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vxor.vv v18, v18, v2
+; RV32-NEXT: vand.vx v2, v10, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi a7, sp, 112
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v18, v18, v24
+; RV32-NEXT: vand.vx v4, v10, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a6, sp, 104
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vxor.vv v18, v18, v26
+; RV32-NEXT: vand.vx v26, v10, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a5, sp, 96
+; RV32-NEXT: vmul.vv v22, v8, v22
+; RV32-NEXT: vxor.vv v18, v18, v22
+; RV32-NEXT: vand.vx v24, v10, a4
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a4, 52(sp)
+; RV32-NEXT: addi a4, sp, 88
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v18, v18, v28
+; RV32-NEXT: vand.vx v28, v10, a3
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a3, 44(sp)
+; RV32-NEXT: addi a3, sp, 80
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v18, v18, v30
+; RV32-NEXT: vand.vx v30, v10, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: addi a1, sp, 64
+; RV32-NEXT: sw a6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vl2r.v v22, (a6) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v0, v8, v22
+; RV32-NEXT: vxor.vv v0, v18, v0
+; RV32-NEXT: vlse64.v v18, (s7), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 56
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vv v14, v0, v14
+; RV32-NEXT: vlse64.v v18, (s6), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s2, sp, 48
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v14, v14, v16
+; RV32-NEXT: vlse64.v v16, (s5), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: mv s5, a6
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, a6, s5
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v16, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s5, sp, 40
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v12, v14, v12
+; RV32-NEXT: vlse64.v v14, (s4), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 5
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v14, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s4, sp, 32
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v12, v20
+; RV32-NEXT: vlse64.v v12, (s1), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: mv s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s1, s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s1, s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add a6, a6, s1
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s1, sp, 24
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vxor.vv v20, v20, v6
+; RV32-NEXT: vlse64.v v12, (s0), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: mv s0, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s0, s0, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add a6, a6, s0
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 16
+; RV32-NEXT: csrr s6, vlenb
+; RV32-NEXT: slli s6, s6, 1
+; RV32-NEXT: mv a6, s6
+; RV32-NEXT: slli s6, s6, 1
+; RV32-NEXT: add a6, a6, s6
+; RV32-NEXT: slli s6, s6, 3
+; RV32-NEXT: add s6, s6, a6
+; RV32-NEXT: lw a6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s6, sp, s6
+; RV32-NEXT: addi s6, s6, 288
+; RV32-NEXT: vl2r.v v12, (s6) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v6, v8, v12
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v20, v20, v6
+; RV32-NEXT: addi s6, sp, 224
+; RV32-NEXT: vlse64.v v0, (s6), zero
+; RV32-NEXT: vxor.vv v20, v20, v2
+; RV32-NEXT: vlse64.v v6, (t5), zero
+; RV32-NEXT: vxor.vv v20, v20, v4
+; RV32-NEXT: vlse64.v v22, (s8), zero
+; RV32-NEXT: vxor.vv v20, v20, v26
+; RV32-NEXT: vlse64.v v18, (s10), zero
+; RV32-NEXT: vxor.vv v20, v20, v24
+; RV32-NEXT: vlse64.v v16, (s11), zero
+; RV32-NEXT: vxor.vv v20, v20, v28
+; RV32-NEXT: vlse64.v v14, (s9), zero
+; RV32-NEXT: vxor.vv v2, v20, v30
+; RV32-NEXT: vlse64.v v12, (s3), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v26, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v4, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 4
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v30, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 5
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v20, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v28, v10, v24
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v10, v24
+; RV32-NEXT: vand.vv v0, v10, v0
+; RV32-NEXT: vand.vv v6, v10, v6
+; RV32-NEXT: vand.vv v22, v10, v22
+; RV32-NEXT: vand.vv v18, v10, v18
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v10, v16
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v16, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v14, v10, v14
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v14, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v12, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (t6), zero
+; RV32-NEXT: vlse64.v v14, (t4), zero
+; RV32-NEXT: vlse64.v v16, (t3), zero
+; RV32-NEXT: vlse64.v v18, (t2), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t3, t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t3, t3, t2
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (t1), zero
+; RV32-NEXT: vlse64.v v14, (t0), zero
+; RV32-NEXT: vlse64.v v16, (ra), zero
+; RV32-NEXT: vlse64.v v18, (a7), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 4
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (a6), zero
+; RV32-NEXT: vlse64.v v14, (a5), zero
+; RV32-NEXT: vlse64.v v16, (a4), zero
+; RV32-NEXT: vlse64.v v18, (a3), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a4, a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (a2), zero
+; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vlse64.v v16, (s7), zero
+; RV32-NEXT: vlse64.v v18, (s2), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v14, (s5), zero
+; RV32-NEXT: vlse64.v v16, (s4), zero
+; RV32-NEXT: vlse64.v v18, (s1), zero
+; RV32-NEXT: vlse64.v v12, (s0), zero
+; RV32-NEXT: vand.vv v14, v10, v14
+; RV32-NEXT: vand.vv v16, v10, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v18, v10, v18
+; RV32-NEXT: vand.vv v16, v10, v12
+; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v10, v2, v10
+; RV32-NEXT: vmul.vv v12, v8, v26
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v4
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v30
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v20
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v28
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v24
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v0
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v6
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v22
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v14
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v18
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vxor.vv v8, v10, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; RV64-NEXT: vand.vi v12, v10, 2
+; RV64-NEXT: vand.vi v14, v10, 1
+; RV64-NEXT: vmul.vv v12, v8, v12
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v14, v12
+; RV64-NEXT: vand.vi v14, v10, 4
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vi v14, v10, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vmul.vv v8, v8, v10
+; RV64-NEXT: vxor.vv v8, v12, v8
+; RV64-NEXT: ret
+ %a = call <vscale x 2 x i64> @llvm.clmul.nxv2i64(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y)
+ ret <vscale x 2 x i64> %a
+}
+
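The much longer RV32 bodies come from constant materialization rather than the expansion itself: RV32 has no 64-bit scalar registers, so while masks for bits 0-31 are formed with li/lui and used via vand.vx, masks for bits 32-63 are written to the stack as lo/hi word pairs and splat-loaded with vlse64, and the resulting splats are then spilled to and reloaded from the vector stack area (vs2r.v/vl2r.v, vs4r.v/vl4r.v at larger LMUL) around the vand.vv sequences. A sketch of the word pairs being stored, assuming a hypothetical helper (bit_mask_words is illustration only, not code from the patch):

    #include <stdint.h>

    /* How each 64-bit single-bit mask (1 << i) splits into the two
       32-bit words the RV32 output stores with sw before vlse64. */
    static void bit_mask_words(unsigned i, uint32_t *lo, uint32_t *hi) {
      *lo = i < 32 ? (uint32_t)1 << i : 0u;
      *hi = i < 32 ? 0u : (uint32_t)1 << (i - 32);
    }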
+define <vscale x 4 x i64> @clmul_nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y) nounwind {
+; RV32-LABEL: clmul_nxv4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li s4, 1
+; RV32-NEXT: li a3, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li a0, 8
+; RV32-NEXT: li s3, 16
+; RV32-NEXT: li s2, 32
+; RV32-NEXT: li s5, 64
+; RV32-NEXT: li s6, 128
+; RV32-NEXT: li s8, 256
+; RV32-NEXT: li s1, 512
+; RV32-NEXT: li s7, 1024
+; RV32-NEXT: lui ra, 1
+; RV32-NEXT: lui s11, 2
+; RV32-NEXT: lui s10, 4
+; RV32-NEXT: lui s9, 8
+; RV32-NEXT: lui s0, 16
+; RV32-NEXT: lui t6, 32
+; RV32-NEXT: lui t5, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a4, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw s4, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a3, 260(sp)
+; RV32-NEXT: lui a3, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw a0, 244(sp)
+; RV32-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV32-NEXT: vand.vi v28, v12, 2
+; RV32-NEXT: vand.vi v4, v12, 1
+; RV32-NEXT: vand.vi v24, v12, 4
+; RV32-NEXT: vand.vi v20, v12, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw s3, 236(sp)
+; RV32-NEXT: vand.vx v16, v12, s3
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: addi s3, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw s2, 228(sp)
+; RV32-NEXT: vand.vx v0, v12, s2
+; RV32-NEXT: addi s2, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s5, 220(sp)
+; RV32-NEXT: vmul.vv v16, v8, v28
+; RV32-NEXT: vmul.vv v28, v8, v4
+; RV32-NEXT: vxor.vi v28, v28, 0
+; RV32-NEXT: vxor.vv v28, v28, v16
+; RV32-NEXT: vand.vx v16, v12, s5
+; RV32-NEXT: addi s5, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s6, 212(sp)
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v28, v28, v24
+; RV32-NEXT: vand.vx v24, v12, s6
+; RV32-NEXT: addi s6, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s8, 204(sp)
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v28, v20
+; RV32-NEXT: vand.vx v28, v12, s8
+; RV32-NEXT: addi s8, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s1, 196(sp)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v20, v20, v4
+; RV32-NEXT: vand.vx v4, v12, s1
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s7, 188(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v20, v20, v0
+; RV32-NEXT: vand.vx v0, v12, s7
+; RV32-NEXT: slli a0, s4, 11
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v20, v20, v16
+; RV32-NEXT: vand.vx v16, v12, ra
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw a0, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw ra, 172(sp)
+; RV32-NEXT: addi s4, sp, 216
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vx v20, v12, s11
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s11, 164(sp)
+; RV32-NEXT: addi s11, sp, 208
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v28, v24, v28
+; RV32-NEXT: vand.vx v24, v12, s10
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s10, 156(sp)
+; RV32-NEXT: addi s10, sp, 200
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v4, v28, v4
+; RV32-NEXT: vand.vx v28, v12, s9
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s9, 148(sp)
+; RV32-NEXT: addi s9, sp, 192
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v4, v4, v0
+; RV32-NEXT: vand.vx v0, v12, a0
+; RV32-NEXT: addi ra, sp, 184
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v0, v4, v0
+; RV32-NEXT: vand.vx v4, v12, s0
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s0, 140(sp)
+; RV32-NEXT: addi s1, sp, 176
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v0, v0, v16
+; RV32-NEXT: vand.vx v16, v12, t6
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw t6, 132(sp)
+; RV32-NEXT: addi s0, sp, 168
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v0, v0, v20
+; RV32-NEXT: vand.vx v20, v12, t5
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t5, 124(sp)
+; RV32-NEXT: addi t6, sp, 160
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v0, v0, v24
+; RV32-NEXT: vand.vx v24, v12, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t5, sp, 152
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v0, v0, v28
+; RV32-NEXT: vand.vx v28, v12, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t4, sp, 144
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v0, v0, v4
+; RV32-NEXT: vand.vx v4, v12, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t3, sp, 136
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v0, v16
+; RV32-NEXT: vand.vx v0, v12, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t2, sp, 128
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vand.vx v16, v12, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t1, sp, 120
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vx v20, v12, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t0, sp, 112
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v24, v24, v28
+; RV32-NEXT: vand.vx v28, v12, a6
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a7, sp, 104
+; RV32-NEXT: vmul.vv v28, v8, v4
+; RV32-NEXT: vxor.vv v24, v24, v28
+; RV32-NEXT: vand.vx v28, v12, a5
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a6, sp, 96
+; RV32-NEXT: vmul.vv v28, v8, v0
+; RV32-NEXT: vxor.vv v28, v24, v28
+; RV32-NEXT: vand.vx v24, v12, a4
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a4, 52(sp)
+; RV32-NEXT: addi a5, sp, 88
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v28, v16
+; RV32-NEXT: vand.vx v28, v12, a3
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a3, 44(sp)
+; RV32-NEXT: addi a4, sp, 80
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v16, v16, v20
+; RV32-NEXT: vand.vx v4, v12, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a3, sp, 72
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a1, 262144
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: lui a0, 524288
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: addi a2, sp, 64
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vlse64.v v16, (s3), zero
+; RV32-NEXT: addi s3, sp, 56
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v0, v20, v0
+; RV32-NEXT: vlse64.v v20, (s2), zero
+; RV32-NEXT: addi s2, sp, 48
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v0, v0, v24
+; RV32-NEXT: vlse64.v v24, (s5), zero
+; RV32-NEXT: addi s5, sp, 40
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v0, v0, v28
+; RV32-NEXT: vlse64.v v28, (s6), zero
+; RV32-NEXT: addi s6, sp, 32
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v4, v0, v4
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v4, (s8), zero
+; RV32-NEXT: addi s8, sp, 24
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi s7, sp, 232
+; RV32-NEXT: vlse64.v v16, (s7), zero
+; RV32-NEXT: addi s7, sp, 224
+; RV32-NEXT: vlse64.v v20, (s7), zero
+; RV32-NEXT: vlse64.v v24, (s4), zero
+; RV32-NEXT: vlse64.v v28, (s11), zero
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 4
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 4
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v20, (s10), zero
+; RV32-NEXT: vlse64.v v24, (s9), zero
+; RV32-NEXT: vlse64.v v28, (ra), zero
+; RV32-NEXT: vlse64.v v4, (s1), zero
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 6
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: add s4, s4, s1
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s0), zero
+; RV32-NEXT: vlse64.v v28, (t6), zero
+; RV32-NEXT: vlse64.v v4, (t5), zero
+; RV32-NEXT: vlse64.v v0, (t4), zero
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 3
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 3
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v0
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v28, (t3), zero
+; RV32-NEXT: vlse64.v v4, (t2), zero
+; RV32-NEXT: vlse64.v v0, (t1), zero
+; RV32-NEXT: vlse64.v v16, (t0), zero
+; RV32-NEXT: vand.vv v20, v12, v28
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v20, v12, v4
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 5
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v20, v12, v0
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 3
+; RV32-NEXT: mv t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t1, t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 4
+; RV32-NEXT: mv t1, t0
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v16, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (a7), zero
+; RV32-NEXT: vlse64.v v0, (a6), zero
+; RV32-NEXT: vlse64.v v20, (a5), zero
+; RV32-NEXT: vlse64.v v24, (a4), zero
+; RV32-NEXT: vand.vv v4, v12, v16
+; RV32-NEXT: vand.vv v16, v12, v0
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: vlse64.v v20, (a2), zero
+; RV32-NEXT: vlse64.v v24, (s3), zero
+; RV32-NEXT: vlse64.v v28, (s2), zero
+; RV32-NEXT: vand.vv v0, v12, v16
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (s5), zero
+; RV32-NEXT: vlse64.v v20, (s6), zero
+; RV32-NEXT: vlse64.v v24, (s8), zero
+; RV32-NEXT: vlse64.v v28, (a0), zero
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v12, v24
+; RV32-NEXT: vand.vv v20, v12, v28
+; RV32-NEXT: vand.vx v12, v12, a1
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vxor.vv v12, v16, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 6
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v4
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v0
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v24
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v8, v8, v20
+; RV32-NEXT: vxor.vv v8, v12, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_nxv4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; RV64-NEXT: vand.vi v16, v12, 2
+; RV64-NEXT: vand.vi v20, v12, 1
+; RV64-NEXT: vmul.vv v16, v8, v16
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v20, v16
+; RV64-NEXT: vand.vi v20, v12, 4
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vi v20, v12, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vmul.vv v8, v8, v12
+; RV64-NEXT: vxor.vv v8, v16, v8
+; RV64-NEXT: ret
+ %a = call <vscale x 4 x i64> @llvm.clmul.nxv4i64(<vscale x 4 x i64> %x, <vscale x 4 x i64> %y)
+ ret <vscale x 4 x i64> %a
+}
+
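+; The sequences above show the generic expansion of llvm.clmul used here: for
+; each bit i of %y, the partial product %x * (%y & (1 << i)) is formed with an
+; ordinary multiply (the mask is a single bit, so the multiply is just a shift
+; and produces no cross-term carries), and the partial products are
+; accumulated with XOR rather than ADD.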
+define <vscale x 8 x i64> @clmul_nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y) nounwind {
+; RV32-LABEL: clmul_nxv8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li s5, 1
+; RV32-NEXT: li a3, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s10, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li t6, 32
+; RV32-NEXT: li s1, 64
+; RV32-NEXT: li s3, 128
+; RV32-NEXT: li s7, 256
+; RV32-NEXT: li s4, 512
+; RV32-NEXT: li s8, 1024
+; RV32-NEXT: lui ra, 1
+; RV32-NEXT: lui s11, 2
+; RV32-NEXT: lui s9, 4
+; RV32-NEXT: lui s6, 8
+; RV32-NEXT: lui s2, 16
+; RV32-NEXT: lui s0, 32
+; RV32-NEXT: lui t5, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a4, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw s5, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a3, 260(sp)
+; RV32-NEXT: lui a3, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s10, 244(sp)
+; RV32-NEXT: vsetvli s10, zero, e64, m8, ta, ma
+; RV32-NEXT: vand.vi v24, v16, 2
+; RV32-NEXT: vand.vi v0, v16, 1
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vi v0, v0, 0
+; RV32-NEXT: vxor.vv v24, v0, v24
+; RV32-NEXT: vand.vi v0, v16, 4
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vi v0, v16, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a0
+; RV32-NEXT: addi s10, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw t6, 228(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t6
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s1, 220(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s1
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s3, 212(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s3
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s7, 204(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s7
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s4, 196(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s4
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s8, 188(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s8
+; RV32-NEXT: slli s5, s5, 11
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw s5, 180(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s5
+; RV32-NEXT: addi s5, sp, 216
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, ra
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw ra, 172(sp)
+; RV32-NEXT: addi ra, sp, 208
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s11
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s11, 164(sp)
+; RV32-NEXT: addi s11, sp, 200
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s9
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s9, 156(sp)
+; RV32-NEXT: addi s9, sp, 192
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s6
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s6, 148(sp)
+; RV32-NEXT: addi s6, sp, 184
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s2
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s2, 140(sp)
+; RV32-NEXT: addi s3, sp, 176
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, s0
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s0, 132(sp)
+; RV32-NEXT: addi s4, sp, 168
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t5
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t5, 124(sp)
+; RV32-NEXT: addi s2, sp, 160
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi s1, sp, 152
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t6, sp, 144
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi s0, sp, 136
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t5, sp, 128
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t4, sp, 120
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t2, sp, 112
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi t3, sp, 104
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi t1, sp, 96
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a4
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a4, 52(sp)
+; RV32-NEXT: addi t0, sp, 88
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a3
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a3, 44(sp)
+; RV32-NEXT: addi a7, sp, 80
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: vand.vx v0, v16, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v24, v24, v0
+; RV32-NEXT: sw t2, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s10), zero
+; RV32-NEXT: addi a6, sp, 72
+; RV32-NEXT: addi a5, sp, 64
+; RV32-NEXT: addi a4, sp, 56
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 8
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi a1, sp, 48
+; RV32-NEXT: addi s10, sp, 40
+; RV32-NEXT: addi a3, sp, 32
+; RV32-NEXT: addi a2, sp, 24
+; RV32-NEXT: addi s7, sp, 264
+; RV32-NEXT: vlse64.v v24, (s7), zero
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 256
+; RV32-NEXT: vlse64.v v0, (s7), zero
+; RV32-NEXT: addi s7, sp, 248
+; RV32-NEXT: vlse64.v v24, (s7), zero
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 240
+; RV32-NEXT: vlse64.v v24, (s7), zero
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 5
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add s7, s7, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, s7
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 16
+; RV32-NEXT: addi s8, sp, 232
+; RV32-NEXT: vlse64.v v24, (s8), zero
+; RV32-NEXT: csrr s8, vlenb
+; RV32-NEXT: slli s8, s8, 4
+; RV32-NEXT: mv t2, s8
+; RV32-NEXT: slli s8, s8, 2
+; RV32-NEXT: add t2, t2, s8
+; RV32-NEXT: slli s8, s8, 1
+; RV32-NEXT: add s8, s8, t2
+; RV32-NEXT: lw t2, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s8, sp, s8
+; RV32-NEXT: addi s8, s8, 288
+; RV32-NEXT: vs8r.v v24, (s8) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi s8, sp, 224
+; RV32-NEXT: vlse64.v v0, (s8), zero
+; RV32-NEXT: vlse64.v v24, (s5), zero
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (ra), zero
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 4
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 4
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vl8r.v v24, (s5) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (s11), zero
+; RV32-NEXT: vlse64.v v24, (s9), zero
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: mv s8, s5
+; RV32-NEXT: slli s5, s5, 3
+; RV32-NEXT: add s8, s8, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s8
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s6), zero
+; RV32-NEXT: csrr s5, vlenb
+; RV32-NEXT: slli s5, s5, 4
+; RV32-NEXT: mv s6, s5
+; RV32-NEXT: slli s5, s5, 2
+; RV32-NEXT: add s6, s6, s5
+; RV32-NEXT: slli s5, s5, 1
+; RV32-NEXT: add s5, s5, s6
+; RV32-NEXT: add s5, sp, s5
+; RV32-NEXT: addi s5, s5, 288
+; RV32-NEXT: vs8r.v v24, (s5) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s3), zero
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 6
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v0, v16, v0
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 4
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v0, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 3
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 3
+; RV32-NEXT: add s5, s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 3
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 2
+; RV32-NEXT: add s5, s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 4
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 2
+; RV32-NEXT: add s5, s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 5
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 2
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 6
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vl8r.v v24, (s3) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 4
+; RV32-NEXT: mv s5, s3
+; RV32-NEXT: slli s3, s3, 2
+; RV32-NEXT: add s5, s5, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s5
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s4), zero
+; RV32-NEXT: csrr s3, vlenb
+; RV32-NEXT: slli s3, s3, 6
+; RV32-NEXT: mv s4, s3
+; RV32-NEXT: slli s3, s3, 1
+; RV32-NEXT: add s3, s3, s4
+; RV32-NEXT: add s3, sp, s3
+; RV32-NEXT: addi s3, s3, 288
+; RV32-NEXT: vs8r.v v24, (s3) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (s2), zero
+; RV32-NEXT: vlse64.v v24, (s1), zero
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: mv s2, s1
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: add s2, s2, s1
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: add s1, s1, s2
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs8r.v v24, (s1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (t6), zero
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 2
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 6
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 2
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 5
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 2
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vl8r.v v24, (t6) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s1, t6
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: add s1, s1, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add t6, t6, s1
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s0), zero
+; RV32-NEXT: csrr t6, vlenb
+; RV32-NEXT: slli t6, t6, 3
+; RV32-NEXT: mv s0, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s0, s0, t6
+; RV32-NEXT: slli t6, t6, 1
+; RV32-NEXT: add s0, s0, t6
+; RV32-NEXT: slli t6, t6, 2
+; RV32-NEXT: add t6, t6, s0
+; RV32-NEXT: add t6, sp, t6
+; RV32-NEXT: addi t6, t6, 288
+; RV32-NEXT: vs8r.v v24, (t6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (t5), zero
+; RV32-NEXT: vlse64.v v24, (t4), zero
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 6
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs8r.v v24, (t4) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (t2), zero
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t4, t4, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t4, t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t4, t4, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 5
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t4, t4, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 6
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t4, t4, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vl8r.v v24, (t2) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 6
+; RV32-NEXT: mv t4, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t4
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (t3), zero
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 4
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t3, t3, t2
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs8r.v v24, (t2) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (t1), zero
+; RV32-NEXT: vlse64.v v24, (t0), zero
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 3
+; RV32-NEXT: mv t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t1, t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t1, t1, t0
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs8r.v v24, (t0) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (a7), zero
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 7
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 4
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 4
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 4
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 7
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vl8r.v v24, (a7) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs8r.v v24, (a7) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (a6), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs8r.v v24, (a6) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (a5), zero
+; RV32-NEXT: vlse64.v v24, (a4), zero
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 4
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs8r.v v24, (a4) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (a1), zero
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 7
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a4, a1
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: mv a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 7
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 4
+; RV32-NEXT: mv a4, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a4, a4, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a4
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s10), zero
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v0, (a3), zero
+; RV32-NEXT: vlse64.v v24, (a2), zero
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s7), zero
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v16, v0
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 6
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v16, v24
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 3
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs8r.v v24, (a1) # vscale x 64-byte Folded Spill
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vl8r.v v24, (a1) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vand.vv v0, v16, v24
+; RV32-NEXT: vand.vx v16, v16, a0
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vxor.vv v16, v24, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 8
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 6
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 6
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: vmul.vv v8, v8, v0
+; RV32-NEXT: vxor.vv v8, v16, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_nxv8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma
+; RV64-NEXT: vand.vi v24, v16, 2
+; RV64-NEXT: vand.vi v0, v16, 1
+; RV64-NEXT: vmul.vv v24, v8, v24
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v0, v24
+; RV64-NEXT: vand.vi v0, v16, 4
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vi v0, v16, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vand.vx v0, v16, a0
+; RV64-NEXT: vand.vx v16, v16, a1
+; RV64-NEXT: vmul.vv v0, v8, v0
+; RV64-NEXT: vxor.vv v24, v24, v0
+; RV64-NEXT: vmul.vv v8, v8, v16
+; RV64-NEXT: vxor.vv v8, v24, v8
+; RV64-NEXT: ret
+ %a = call <vscale x 8 x i64> @llvm.clmul.nxv8i64(<vscale x 8 x i64> %x, <vscale x 8 x i64> %y)
+ ret <vscale x 8 x i64> %a
+}
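+; For <vscale x 8 x i64> on RV32, the per-bit masks and partial products do
+; not all fit in vector registers, so the expansion above spills and reloads
+; whole vector register groups (vs8r.v/vl8r.v) around each multiply step.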
+
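+; With no RISC-V-specific lowering, clmul is expanded bit by bit: each bit of
+; the second operand is isolated with vand.vi/vand.vx, multiplied into the
+; first operand with vmul.vv, and the partial products are folded together
+; with vxor.vv.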
+define <vscale x 4 x i8> @clmul_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) nounwind {
+; CHECK-LABEL: clmul_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i8> @llvm.clmul.nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b)
+ ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @clmul_nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b) nounwind {
+; CHECK-LABEL: clmul_nxv4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %res = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %a, <vscale x 4 x i16> %b)
+ ret <vscale x 4 x i16> %res
+}
+
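+; clmulr (the bit-reversed carry-less multiply) is expressed here by widening
+; both operands, computing clmul at double width, and shifting the product
+; right by the element width minus one (7 for i8) before truncating.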
+define <vscale x 4 x i8> @clmulr_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) nounwind {
+; CHECK-LABEL: clmulr_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v8, v9
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vand.vi v9, v8, 2
+; CHECK-NEXT: vand.vi v11, v8, 1
+; CHECK-NEXT: vmul.vv v9, v10, v9
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v11, v9
+; CHECK-NEXT: vand.vi v11, v8, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vi v11, v8, 8
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vmul.vv v8, v10, v8
+; CHECK-NEXT: vxor.vv v8, v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 7
+; CHECK-NEXT: ret
+ %a.ext = zext <vscale x 4 x i8> %a to <vscale x 4 x i16>
+ %b.ext = zext <vscale x 4 x i8> %b to <vscale x 4 x i16>
+ %clmul = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %a.ext, <vscale x 4 x i16> %b.ext)
+ %res.ext = lshr <vscale x 4 x i16> %clmul, splat(i16 7)
+ %res = trunc <vscale x 4 x i16> %res.ext to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %res
+}
+
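+; clmulh (the high half of the carry-less product) uses the same widening
+; pattern, but shifts right by the full element width (8 for i8).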
+define <vscale x 4 x i8> @clmulh_nxv4i8(<vscale x 4 x i8> %a, <vscale x 4 x i8> %b) nounwind {
+; CHECK-LABEL: clmulh_nxv4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v8, v9
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vand.vi v9, v8, 2
+; CHECK-NEXT: vand.vi v11, v8, 1
+; CHECK-NEXT: vmul.vv v9, v10, v9
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v11, v9
+; CHECK-NEXT: vand.vi v11, v8, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vi v11, v8, 8
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vmul.vv v8, v10, v8
+; CHECK-NEXT: vxor.vv v8, v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf2, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 8
+; CHECK-NEXT: ret
+ %a.ext = zext <vscale x 4 x i8> %a to <vscale x 4 x i16>
+ %b.ext = zext <vscale x 4 x i8> %b to <vscale x 4 x i16>
+ %clmul = call <vscale x 4 x i16> @llvm.clmul.nxv4i16(<vscale x 4 x i16> %a.ext, <vscale x 4 x i16> %b.ext)
+ %res.ext = lshr <vscale x 4 x i16> %clmul, splat(i16 8)
+ %res = trunc <vscale x 4 x i16> %res.ext to <vscale x 4 x i8>
+ ret <vscale x 4 x i8> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
new file mode 100644
index 0000000000000..0c9e96e2a1694
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-clmul.ll
@@ -0,0 +1,4888 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs -mattr=+v < %s | FileCheck %s --check-prefixes=CHECK,RV64
+
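+; The fixed-length tests expand identically to the scalable-vector ones: one
+; vand/vmul/vxor step per bit of the element type (32 steps for i32).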
+define <1 x i32> @clmul_v1i32(<1 x i32> %x, <1 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_v1i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <1 x i32> @llvm.clmul.v1i32(<1 x i32> %x, <1 x i32> %y)
+ ret <1 x i32> %a
+}
+
+define <2 x i32> @clmul_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <2 x i32> @llvm.clmul.v2i32(<2 x i32> %x, <2 x i32> %y)
+ ret <2 x i32> %a
+}
+
+define <4 x i32> @clmul_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_v4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <4 x i32> @llvm.clmul.v4i32(<4 x i32> %x, <4 x i32> %y)
+ ret <4 x i32> %a
+}
+
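+; Note that this test clmuls %x with itself, so %y is unused and the checks
+; below mask and multiply v8 rather than v10.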
+define <8 x i32> @clmul_v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vand.vi v10, v8, 2
+; CHECK-NEXT: vand.vi v12, v8, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v12, v10
+; CHECK-NEXT: vand.vi v12, v8, 4
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vi v12, v8, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vmul.vv v12, v8, v12
+; CHECK-NEXT: vxor.vv v10, v10, v12
+; CHECK-NEXT: vand.vx v12, v8, a0
+; CHECK-NEXT: vmul.vv v8, v8, v12
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %a = call <8 x i32> @llvm.clmul.v8i32(<8 x i32> %x, <8 x i32> %x)
+ ret <8 x i32> %a
+}
+
+define <16 x i32> @clmul_v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
+; CHECK-LABEL: clmul_v16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; CHECK-NEXT: vand.vi v16, v12, 2
+; CHECK-NEXT: vand.vi v20, v12, 1
+; CHECK-NEXT: vmul.vv v16, v8, v16
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v20, v16
+; CHECK-NEXT: vand.vi v20, v12, 4
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vi v20, v12, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 16
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 32
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 64
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 128
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 256
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 512
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 1024
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 2048
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 4096
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 8192
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 16384
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 32768
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 65536
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 131072
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 262144
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vand.vx v20, v12, a0
+; CHECK-NEXT: lui a0, 524288
+; CHECK-NEXT: vand.vx v12, v12, a0
+; CHECK-NEXT: vmul.vv v20, v8, v20
+; CHECK-NEXT: vxor.vv v16, v16, v20
+; CHECK-NEXT: vmul.vv v8, v8, v12
+; CHECK-NEXT: vxor.vv v8, v16, v8
+; CHECK-NEXT: ret
+ %a = call <16 x i32> @llvm.clmul.v16i32(<16 x i32> %x, <16 x i32> %y)
+ ret <16 x i32> %a
+}
+
+define <1 x i64> @clmul_v1i64(<1 x i64> %x, <1 x i64> %y) nounwind {
+; RV32-LABEL: clmul_v1i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li t5, 1
+; RV32-NEXT: li a4, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s11, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li ra, 32
+; RV32-NEXT: li s10, 64
+; RV32-NEXT: li s9, 128
+; RV32-NEXT: li s8, 256
+; RV32-NEXT: li s7, 512
+; RV32-NEXT: li s1, 1024
+; RV32-NEXT: lui s6, 1
+; RV32-NEXT: lui s5, 2
+; RV32-NEXT: lui s4, 4
+; RV32-NEXT: lui s3, 8
+; RV32-NEXT: lui s2, 16
+; RV32-NEXT: lui s0, 32
+; RV32-NEXT: lui t6, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a3, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw t5, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a4, 260(sp)
+; RV32-NEXT: lui a4, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s11, 244(sp)
+; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV32-NEXT: vand.vi v13, v9, 2
+; RV32-NEXT: vand.vi v14, v9, 1
+; RV32-NEXT: vand.vi v12, v9, 4
+; RV32-NEXT: vand.vi v11, v9, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vand.vx v10, v9, a0
+; RV32-NEXT: addi s11, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw ra, 228(sp)
+; RV32-NEXT: vand.vx v15, v9, ra
+; RV32-NEXT: addi ra, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s10, 220(sp)
+; RV32-NEXT: vand.vx v16, v9, s10
+; RV32-NEXT: addi s10, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s9, 212(sp)
+; RV32-NEXT: vand.vx v17, v9, s9
+; RV32-NEXT: addi s9, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s8, 204(sp)
+; RV32-NEXT: vand.vx v18, v9, s8
+; RV32-NEXT: addi s8, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s7, 196(sp)
+; RV32-NEXT: vand.vx v19, v9, s7
+; RV32-NEXT: addi s7, sp, 232
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s1, 188(sp)
+; RV32-NEXT: vand.vx v20, v9, s1
+; RV32-NEXT: slli t5, t5, 11
+; RV32-NEXT: vand.vx v21, v9, s6
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw t5, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw s6, 172(sp)
+; RV32-NEXT: addi s6, sp, 216
+; RV32-NEXT: vand.vx v22, v9, s5
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s5, 164(sp)
+; RV32-NEXT: addi s5, sp, 208
+; RV32-NEXT: vand.vx v23, v9, s4
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s4, 156(sp)
+; RV32-NEXT: addi s4, sp, 200
+; RV32-NEXT: vand.vx v24, v9, s3
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s3, 148(sp)
+; RV32-NEXT: addi s3, sp, 192
+; RV32-NEXT: vand.vx v25, v9, s2
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s2, 140(sp)
+; RV32-NEXT: addi s2, sp, 184
+; RV32-NEXT: vand.vx v26, v9, s0
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s0, 132(sp)
+; RV32-NEXT: addi s1, sp, 176
+; RV32-NEXT: vand.vx v27, v9, t6
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t6, 124(sp)
+; RV32-NEXT: addi s0, sp, 168
+; RV32-NEXT: vand.vx v28, v9, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t6, sp, 160
+; RV32-NEXT: vand.vx v29, v9, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t4, sp, 152
+; RV32-NEXT: vand.vx v30, v9, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t3, sp, 144
+; RV32-NEXT: vand.vx v31, v9, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t2, sp, 136
+; RV32-NEXT: vand.vx v7, v9, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t1, sp, 128
+; RV32-NEXT: vand.vx v6, v9, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t0, sp, 120
+; RV32-NEXT: vand.vx v5, v9, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a7, sp, 112
+; RV32-NEXT: vand.vx v4, v9, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a6, sp, 104
+; RV32-NEXT: vand.vx v3, v9, a3
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a3, 52(sp)
+; RV32-NEXT: addi a5, sp, 96
+; RV32-NEXT: vand.vx v2, v9, a4
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a4, 44(sp)
+; RV32-NEXT: addi a4, sp, 88
+; RV32-NEXT: vand.vx v1, v9, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a3, sp, 80
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: vand.vx v0, v9, t5
+; RV32-NEXT: addi a1, sp, 64
+; RV32-NEXT: vmul.vv v13, v8, v13
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vi v14, v14, 0
+; RV32-NEXT: vxor.vv v14, v14, v13
+; RV32-NEXT: vlse64.v v13, (s11), zero
+; RV32-NEXT: addi s11, sp, 56
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v14, v14, v12
+; RV32-NEXT: vlse64.v v12, (ra), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv ra, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, ra
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi ra, sp, 48
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v14, v14, v11
+; RV32-NEXT: vlse64.v v11, (s10), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli s10, t5, 2
+; RV32-NEXT: add t5, s10, t5
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s10, sp, 40
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v14, v14, v10
+; RV32-NEXT: vlse64.v v10, (s9), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi t5, sp, 32
+; RV32-NEXT: vmul.vv v15, v8, v15
+; RV32-NEXT: vxor.vv v15, v14, v15
+; RV32-NEXT: vlse64.v v10, (s8), zero
+; RV32-NEXT: csrr s8, vlenb
+; RV32-NEXT: slli s9, s8, 1
+; RV32-NEXT: add s8, s9, s8
+; RV32-NEXT: add s8, sp, s8
+; RV32-NEXT: addi s8, s8, 288
+; RV32-NEXT: vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s8, sp, 24
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v15, v16
+; RV32-NEXT: vlse64.v v10, (s7), zero
+; RV32-NEXT: csrr s7, vlenb
+; RV32-NEXT: slli s7, s7, 1
+; RV32-NEXT: add s7, sp, s7
+; RV32-NEXT: addi s7, s7, 288
+; RV32-NEXT: vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 16
+; RV32-NEXT: vmul.vv v17, v8, v17
+; RV32-NEXT: vmul.vv v18, v8, v18
+; RV32-NEXT: vmul.vv v19, v8, v19
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vmul.vv v21, v8, v21
+; RV32-NEXT: vmul.vv v22, v8, v22
+; RV32-NEXT: vmul.vv v23, v8, v23
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v25, v8, v25
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vmul.vv v27, v8, v27
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v29, v8, v29
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vmul.vv v31, v8, v31
+; RV32-NEXT: vmul.vv v7, v8, v7
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vmul.vv v5, v8, v5
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vmul.vv v3, v8, v3
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vmul.vv v1, v8, v1
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v16, v16, v17
+; RV32-NEXT: addi s9, sp, 224
+; RV32-NEXT: vlse64.v v11, (s9), zero
+; RV32-NEXT: vxor.vv v16, v16, v18
+; RV32-NEXT: vlse64.v v10, (s6), zero
+; RV32-NEXT: csrr s6, vlenb
+; RV32-NEXT: add s6, sp, s6
+; RV32-NEXT: addi s6, s6, 288
+; RV32-NEXT: vs1r.v v10, (s6) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v19
+; RV32-NEXT: vlse64.v v10, (s5), zero
+; RV32-NEXT: addi s5, sp, 288
+; RV32-NEXT: vs1r.v v10, (s5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v20
+; RV32-NEXT: vlse64.v v12, (s4), zero
+; RV32-NEXT: vxor.vv v16, v16, v0
+; RV32-NEXT: vlse64.v v0, (s3), zero
+; RV32-NEXT: vxor.vv v16, v16, v21
+; RV32-NEXT: vlse64.v v21, (s2), zero
+; RV32-NEXT: vxor.vv v16, v16, v22
+; RV32-NEXT: vlse64.v v22, (s1), zero
+; RV32-NEXT: vxor.vv v16, v16, v23
+; RV32-NEXT: vlse64.v v23, (s0), zero
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: vlse64.v v24, (t6), zero
+; RV32-NEXT: vxor.vv v16, v16, v25
+; RV32-NEXT: vlse64.v v25, (t4), zero
+; RV32-NEXT: vxor.vv v16, v16, v26
+; RV32-NEXT: vlse64.v v26, (t3), zero
+; RV32-NEXT: vxor.vv v16, v16, v27
+; RV32-NEXT: vlse64.v v27, (t2), zero
+; RV32-NEXT: vxor.vv v16, v16, v28
+; RV32-NEXT: vlse64.v v28, (t1), zero
+; RV32-NEXT: vxor.vv v16, v16, v29
+; RV32-NEXT: vlse64.v v29, (t0), zero
+; RV32-NEXT: vxor.vv v16, v16, v30
+; RV32-NEXT: vlse64.v v30, (a7), zero
+; RV32-NEXT: vxor.vv v16, v16, v31
+; RV32-NEXT: vlse64.v v31, (a6), zero
+; RV32-NEXT: vxor.vv v16, v16, v7
+; RV32-NEXT: vlse64.v v7, (a5), zero
+; RV32-NEXT: vxor.vv v16, v16, v6
+; RV32-NEXT: vlse64.v v6, (a4), zero
+; RV32-NEXT: vxor.vv v16, v16, v5
+; RV32-NEXT: vlse64.v v5, (a3), zero
+; RV32-NEXT: vxor.vv v16, v16, v4
+; RV32-NEXT: vlse64.v v4, (a2), zero
+; RV32-NEXT: vxor.vv v16, v16, v3
+; RV32-NEXT: vlse64.v v3, (a1), zero
+; RV32-NEXT: vxor.vv v16, v16, v2
+; RV32-NEXT: vlse64.v v2, (s11), zero
+; RV32-NEXT: vxor.vv v1, v16, v1
+; RV32-NEXT: vlse64.v v10, (ra), zero
+; RV32-NEXT: vand.vv v13, v9, v13
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v14, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v14, v9, v14
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v15, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v15, v9, v15
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v16, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v9, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v17, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v17, v9, v17
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v18, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v18, v9, v18
+; RV32-NEXT: vand.vv v19, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v20, v9, v11
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v11, v9, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v0, v9, v0
+; RV32-NEXT: vand.vv v21, v9, v21
+; RV32-NEXT: vand.vv v22, v9, v22
+; RV32-NEXT: vand.vv v23, v9, v23
+; RV32-NEXT: vand.vv v24, v9, v24
+; RV32-NEXT: vand.vv v25, v9, v25
+; RV32-NEXT: vand.vv v26, v9, v26
+; RV32-NEXT: vand.vv v27, v9, v27
+; RV32-NEXT: vand.vv v28, v9, v28
+; RV32-NEXT: vand.vv v29, v9, v29
+; RV32-NEXT: vand.vv v30, v9, v30
+; RV32-NEXT: vand.vv v31, v9, v31
+; RV32-NEXT: vand.vv v7, v9, v7
+; RV32-NEXT: vand.vv v6, v9, v6
+; RV32-NEXT: vand.vv v5, v9, v5
+; RV32-NEXT: vand.vv v4, v9, v4
+; RV32-NEXT: vand.vv v11, v9, v3
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v2, v9, v2
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vlse64.v v10, (s10), zero
+; RV32-NEXT: vlse64.v v3, (t5), zero
+; RV32-NEXT: vlse64.v v11, (s8), zero
+; RV32-NEXT: vlse64.v v12, (s7), zero
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: vand.vv v3, v9, v3
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v9, v12
+; RV32-NEXT: vand.vx v9, v9, a0
+; RV32-NEXT: vmul.vv v9, v8, v9
+; RV32-NEXT: vxor.vv v9, v1, v9
+; RV32-NEXT: vmul.vv v11, v8, v13
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v14
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v15
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v16
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v17
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v18
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v19
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v20
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v0
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v21
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v22
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v23
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v24
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v25
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v26
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v27
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v28
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v29
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v30
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v31
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v7
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v6
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v5
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v4
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v2
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v10, v8, v3
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 2
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v8, v8, v12
+; RV32-NEXT: vxor.vv v8, v9, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_v1i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma
+; RV64-NEXT: vand.vi v10, v9, 2
+; RV64-NEXT: vand.vi v11, v9, 1
+; RV64-NEXT: vmul.vv v10, v8, v10
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v11, v10
+; RV64-NEXT: vand.vi v11, v9, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vi v11, v9, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vmul.vv v8, v8, v9
+; RV64-NEXT: vxor.vv v8, v10, v8
+; RV64-NEXT: ret
+ %a = call <1 x i64> @llvm.clmul.v1i64(<1 x i64> %x, <1 x i64> %y)
+ ret <1 x i64> %a
+}
+
+define <2 x i64> @clmul_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
+; RV32-LABEL: clmul_v2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li t5, 1
+; RV32-NEXT: li a4, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s11, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li ra, 32
+; RV32-NEXT: li s10, 64
+; RV32-NEXT: li s9, 128
+; RV32-NEXT: li s8, 256
+; RV32-NEXT: li s7, 512
+; RV32-NEXT: li s1, 1024
+; RV32-NEXT: lui s6, 1
+; RV32-NEXT: lui s5, 2
+; RV32-NEXT: lui s4, 4
+; RV32-NEXT: lui s3, 8
+; RV32-NEXT: lui s2, 16
+; RV32-NEXT: lui s0, 32
+; RV32-NEXT: lui t6, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a3, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw t5, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a4, 260(sp)
+; RV32-NEXT: lui a4, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s11, 244(sp)
+; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV32-NEXT: vand.vi v13, v9, 2
+; RV32-NEXT: vand.vi v14, v9, 1
+; RV32-NEXT: vand.vi v12, v9, 4
+; RV32-NEXT: vand.vi v11, v9, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vand.vx v10, v9, a0
+; RV32-NEXT: addi s11, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw ra, 228(sp)
+; RV32-NEXT: vand.vx v15, v9, ra
+; RV32-NEXT: addi ra, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s10, 220(sp)
+; RV32-NEXT: vand.vx v16, v9, s10
+; RV32-NEXT: addi s10, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s9, 212(sp)
+; RV32-NEXT: vand.vx v17, v9, s9
+; RV32-NEXT: addi s9, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s8, 204(sp)
+; RV32-NEXT: vand.vx v18, v9, s8
+; RV32-NEXT: addi s8, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s7, 196(sp)
+; RV32-NEXT: vand.vx v19, v9, s7
+; RV32-NEXT: addi s7, sp, 232
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s1, 188(sp)
+; RV32-NEXT: vand.vx v20, v9, s1
+; RV32-NEXT: slli t5, t5, 11
+; RV32-NEXT: vand.vx v21, v9, s6
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw t5, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw s6, 172(sp)
+; RV32-NEXT: addi s6, sp, 216
+; RV32-NEXT: vand.vx v22, v9, s5
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s5, 164(sp)
+; RV32-NEXT: addi s5, sp, 208
+; RV32-NEXT: vand.vx v23, v9, s4
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s4, 156(sp)
+; RV32-NEXT: addi s4, sp, 200
+; RV32-NEXT: vand.vx v24, v9, s3
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s3, 148(sp)
+; RV32-NEXT: addi s3, sp, 192
+; RV32-NEXT: vand.vx v25, v9, s2
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s2, 140(sp)
+; RV32-NEXT: addi s2, sp, 184
+; RV32-NEXT: vand.vx v26, v9, s0
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s0, 132(sp)
+; RV32-NEXT: addi s1, sp, 176
+; RV32-NEXT: vand.vx v27, v9, t6
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t6, 124(sp)
+; RV32-NEXT: addi s0, sp, 168
+; RV32-NEXT: vand.vx v28, v9, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t6, sp, 160
+; RV32-NEXT: vand.vx v29, v9, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t4, sp, 152
+; RV32-NEXT: vand.vx v30, v9, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t3, sp, 144
+; RV32-NEXT: vand.vx v31, v9, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t2, sp, 136
+; RV32-NEXT: vand.vx v7, v9, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t1, sp, 128
+; RV32-NEXT: vand.vx v6, v9, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t0, sp, 120
+; RV32-NEXT: vand.vx v5, v9, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a7, sp, 112
+; RV32-NEXT: vand.vx v4, v9, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a6, sp, 104
+; RV32-NEXT: vand.vx v3, v9, a3
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a3, 52(sp)
+; RV32-NEXT: addi a5, sp, 96
+; RV32-NEXT: vand.vx v2, v9, a4
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a4, 44(sp)
+; RV32-NEXT: addi a4, sp, 88
+; RV32-NEXT: vand.vx v1, v9, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a3, sp, 80
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: vand.vx v0, v9, t5
+; RV32-NEXT: addi a1, sp, 64
+; RV32-NEXT: vmul.vv v13, v8, v13
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vi v14, v14, 0
+; RV32-NEXT: vxor.vv v14, v14, v13
+; RV32-NEXT: vlse64.v v13, (s11), zero
+; RV32-NEXT: addi s11, sp, 56
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v14, v14, v12
+; RV32-NEXT: vlse64.v v12, (ra), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv ra, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, ra
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v12, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi ra, sp, 48
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v14, v14, v11
+; RV32-NEXT: vlse64.v v11, (s10), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli s10, t5, 2
+; RV32-NEXT: add t5, s10, t5
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v11, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s10, sp, 40
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v14, v14, v10
+; RV32-NEXT: vlse64.v v10, (s9), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs1r.v v10, (t5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi t5, sp, 32
+; RV32-NEXT: vmul.vv v15, v8, v15
+; RV32-NEXT: vxor.vv v15, v14, v15
+; RV32-NEXT: vlse64.v v10, (s8), zero
+; RV32-NEXT: csrr s8, vlenb
+; RV32-NEXT: slli s9, s8, 1
+; RV32-NEXT: add s8, s9, s8
+; RV32-NEXT: add s8, sp, s8
+; RV32-NEXT: addi s8, s8, 288
+; RV32-NEXT: vs1r.v v10, (s8) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s8, sp, 24
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v15, v16
+; RV32-NEXT: vlse64.v v10, (s7), zero
+; RV32-NEXT: csrr s7, vlenb
+; RV32-NEXT: slli s7, s7, 1
+; RV32-NEXT: add s7, sp, s7
+; RV32-NEXT: addi s7, s7, 288
+; RV32-NEXT: vs1r.v v10, (s7) # vscale x 8-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 16
+; RV32-NEXT: vmul.vv v17, v8, v17
+; RV32-NEXT: vmul.vv v18, v8, v18
+; RV32-NEXT: vmul.vv v19, v8, v19
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vmul.vv v21, v8, v21
+; RV32-NEXT: vmul.vv v22, v8, v22
+; RV32-NEXT: vmul.vv v23, v8, v23
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v25, v8, v25
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vmul.vv v27, v8, v27
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v29, v8, v29
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vmul.vv v31, v8, v31
+; RV32-NEXT: vmul.vv v7, v8, v7
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vmul.vv v5, v8, v5
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vmul.vv v3, v8, v3
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vmul.vv v1, v8, v1
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v16, v16, v17
+; RV32-NEXT: addi s9, sp, 224
+; RV32-NEXT: vlse64.v v11, (s9), zero
+; RV32-NEXT: vxor.vv v16, v16, v18
+; RV32-NEXT: vlse64.v v10, (s6), zero
+; RV32-NEXT: csrr s6, vlenb
+; RV32-NEXT: add s6, sp, s6
+; RV32-NEXT: addi s6, s6, 288
+; RV32-NEXT: vs1r.v v10, (s6) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v19
+; RV32-NEXT: vlse64.v v10, (s5), zero
+; RV32-NEXT: addi s5, sp, 288
+; RV32-NEXT: vs1r.v v10, (s5) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vxor.vv v16, v16, v20
+; RV32-NEXT: vlse64.v v12, (s4), zero
+; RV32-NEXT: vxor.vv v16, v16, v0
+; RV32-NEXT: vlse64.v v0, (s3), zero
+; RV32-NEXT: vxor.vv v16, v16, v21
+; RV32-NEXT: vlse64.v v21, (s2), zero
+; RV32-NEXT: vxor.vv v16, v16, v22
+; RV32-NEXT: vlse64.v v22, (s1), zero
+; RV32-NEXT: vxor.vv v16, v16, v23
+; RV32-NEXT: vlse64.v v23, (s0), zero
+; RV32-NEXT: vxor.vv v16, v16, v24
+; RV32-NEXT: vlse64.v v24, (t6), zero
+; RV32-NEXT: vxor.vv v16, v16, v25
+; RV32-NEXT: vlse64.v v25, (t4), zero
+; RV32-NEXT: vxor.vv v16, v16, v26
+; RV32-NEXT: vlse64.v v26, (t3), zero
+; RV32-NEXT: vxor.vv v16, v16, v27
+; RV32-NEXT: vlse64.v v27, (t2), zero
+; RV32-NEXT: vxor.vv v16, v16, v28
+; RV32-NEXT: vlse64.v v28, (t1), zero
+; RV32-NEXT: vxor.vv v16, v16, v29
+; RV32-NEXT: vlse64.v v29, (t0), zero
+; RV32-NEXT: vxor.vv v16, v16, v30
+; RV32-NEXT: vlse64.v v30, (a7), zero
+; RV32-NEXT: vxor.vv v16, v16, v31
+; RV32-NEXT: vlse64.v v31, (a6), zero
+; RV32-NEXT: vxor.vv v16, v16, v7
+; RV32-NEXT: vlse64.v v7, (a5), zero
+; RV32-NEXT: vxor.vv v16, v16, v6
+; RV32-NEXT: vlse64.v v6, (a4), zero
+; RV32-NEXT: vxor.vv v16, v16, v5
+; RV32-NEXT: vlse64.v v5, (a3), zero
+; RV32-NEXT: vxor.vv v16, v16, v4
+; RV32-NEXT: vlse64.v v4, (a2), zero
+; RV32-NEXT: vxor.vv v16, v16, v3
+; RV32-NEXT: vlse64.v v3, (a1), zero
+; RV32-NEXT: vxor.vv v16, v16, v2
+; RV32-NEXT: vlse64.v v2, (s11), zero
+; RV32-NEXT: vxor.vv v1, v16, v1
+; RV32-NEXT: vlse64.v v10, (ra), zero
+; RV32-NEXT: vand.vv v13, v9, v13
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v14, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v14, v9, v14
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v15, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v15, v9, v15
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v16, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v16, v9, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v17, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v17, v9, v17
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v18, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v18, v9, v18
+; RV32-NEXT: vand.vv v19, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v20, v9, v11
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vl1r.v v11, (a1) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v11, v9, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 1
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v0, v9, v0
+; RV32-NEXT: vand.vv v21, v9, v21
+; RV32-NEXT: vand.vv v22, v9, v22
+; RV32-NEXT: vand.vv v23, v9, v23
+; RV32-NEXT: vand.vv v24, v9, v24
+; RV32-NEXT: vand.vv v25, v9, v25
+; RV32-NEXT: vand.vv v26, v9, v26
+; RV32-NEXT: vand.vv v27, v9, v27
+; RV32-NEXT: vand.vv v28, v9, v28
+; RV32-NEXT: vand.vv v29, v9, v29
+; RV32-NEXT: vand.vv v30, v9, v30
+; RV32-NEXT: vand.vv v31, v9, v31
+; RV32-NEXT: vand.vv v7, v9, v7
+; RV32-NEXT: vand.vv v6, v9, v6
+; RV32-NEXT: vand.vv v5, v9, v5
+; RV32-NEXT: vand.vv v4, v9, v4
+; RV32-NEXT: vand.vv v11, v9, v3
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v2, v9, v2
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v10, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vlse64.v v10, (s10), zero
+; RV32-NEXT: vlse64.v v3, (t5), zero
+; RV32-NEXT: vlse64.v v11, (s8), zero
+; RV32-NEXT: vlse64.v v12, (s7), zero
+; RV32-NEXT: vand.vv v10, v9, v10
+; RV32-NEXT: vand.vv v3, v9, v3
+; RV32-NEXT: vand.vv v11, v9, v11
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs1r.v v11, (a1) # vscale x 8-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v9, v12
+; RV32-NEXT: vand.vx v9, v9, a0
+; RV32-NEXT: vmul.vv v9, v8, v9
+; RV32-NEXT: vxor.vv v9, v1, v9
+; RV32-NEXT: vmul.vv v11, v8, v13
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v14
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v15
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v16
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v17
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v18
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v19
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v20
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v0
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v21
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v22
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v23
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v24
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v25
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v26
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v27
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v28
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v29
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v30
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v31
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v7
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v6
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v5
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v4
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v11, v8, v2
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v11, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v11, v8, v11
+; RV32-NEXT: vxor.vv v9, v9, v11
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v10, v8, v3
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 2
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl1r.v v10, (a0) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v9, v9, v10
+; RV32-NEXT: vmul.vv v8, v8, v12
+; RV32-NEXT: vxor.vv v8, v9, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 3
+; RV32-NEXT: sub a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_v2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT: vand.vi v10, v9, 2
+; RV64-NEXT: vand.vi v11, v9, 1
+; RV64-NEXT: vmul.vv v10, v8, v10
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v11, v10
+; RV64-NEXT: vand.vi v11, v9, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vi v11, v9, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vand.vx v11, v9, a0
+; RV64-NEXT: vand.vx v9, v9, a1
+; RV64-NEXT: vmul.vv v11, v8, v11
+; RV64-NEXT: vxor.vv v10, v10, v11
+; RV64-NEXT: vmul.vv v8, v8, v9
+; RV64-NEXT: vxor.vv v8, v10, v8
+; RV64-NEXT: ret
+ %a = call <2 x i64> @llvm.clmul.v2i64(<2 x i64> %x, <2 x i64> %y)
+ ret <2 x i64> %a
+}
+
+define <4 x i64> @clmul_v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
+; RV32-LABEL: clmul_v4i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li s2, 1
+; RV32-NEXT: li a3, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li s7, 8
+; RV32-NEXT: li a0, 16
+; RV32-NEXT: li s6, 32
+; RV32-NEXT: li s5, 64
+; RV32-NEXT: li s4, 128
+; RV32-NEXT: li s1, 256
+; RV32-NEXT: li s0, 512
+; RV32-NEXT: li t5, 1024
+; RV32-NEXT: lui ra, 1
+; RV32-NEXT: lui s8, 2
+; RV32-NEXT: lui s10, 4
+; RV32-NEXT: lui s11, 8
+; RV32-NEXT: lui s9, 16
+; RV32-NEXT: lui s3, 32
+; RV32-NEXT: lui t6, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a4, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw s2, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a3, 260(sp)
+; RV32-NEXT: lui a3, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw s7, 244(sp)
+; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV32-NEXT: vand.vi v28, v10, 2
+; RV32-NEXT: vand.vi v20, v10, 1
+; RV32-NEXT: vand.vi v30, v10, 4
+; RV32-NEXT: vand.vi v14, v10, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw a0, 236(sp)
+; RV32-NEXT: vand.vx v12, v10, a0
+; RV32-NEXT: addi s7, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw s6, 228(sp)
+; RV32-NEXT: vand.vx v16, v10, s6
+; RV32-NEXT: addi s6, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s5, 220(sp)
+; RV32-NEXT: vand.vx v18, v10, s5
+; RV32-NEXT: addi s5, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s4, 212(sp)
+; RV32-NEXT: vand.vx v0, v10, s4
+; RV32-NEXT: addi s4, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s1, 204(sp)
+; RV32-NEXT: vand.vx v6, v10, s1
+; RV32-NEXT: addi s1, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s0, 196(sp)
+; RV32-NEXT: vand.vx v4, v10, s0
+; RV32-NEXT: addi s0, sp, 232
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw t5, 188(sp)
+; RV32-NEXT: vand.vx v2, v10, t5
+; RV32-NEXT: slli s2, s2, 11
+; RV32-NEXT: vand.vx v24, v10, ra
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw s2, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw ra, 172(sp)
+; RV32-NEXT: addi t5, sp, 216
+; RV32-NEXT: vand.vx v26, v10, s8
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s8, 164(sp)
+; RV32-NEXT: addi s8, sp, 208
+; RV32-NEXT: vand.vx v22, v10, s10
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s10, 156(sp)
+; RV32-NEXT: addi s10, sp, 200
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vi v20, v20, 0
+; RV32-NEXT: vxor.vv v20, v20, v28
+; RV32-NEXT: vand.vx v28, v10, s11
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s11, 148(sp)
+; RV32-NEXT: addi s11, sp, 192
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v20, v20, v30
+; RV32-NEXT: vand.vx v30, v10, s9
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s9, 140(sp)
+; RV32-NEXT: addi s9, sp, 184
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vv v14, v20, v14
+; RV32-NEXT: vand.vx v20, v10, s3
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, ra
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs2r.v v20, (a0) # vscale x 16-byte Folded Spill
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw s3, 132(sp)
+; RV32-NEXT: addi s3, sp, 176
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v12, v14, v12
+; RV32-NEXT: vand.vx v14, v10, t6
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t6, 124(sp)
+; RV32-NEXT: addi t6, sp, 168
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vand.vx v16, v10, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t4, sp, 160
+; RV32-NEXT: vmul.vv v18, v8, v18
+; RV32-NEXT: vxor.vv v18, v12, v18
+; RV32-NEXT: vand.vx v12, v10, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t3, sp, 152
+; RV32-NEXT: vmul.vv v20, v8, v0
+; RV32-NEXT: vxor.vv v18, v18, v20
+; RV32-NEXT: vand.vx v20, v10, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t2, sp, 144
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vxor.vv v18, v18, v6
+; RV32-NEXT: vand.vx v6, v10, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t1, sp, 136
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v18, v18, v4
+; RV32-NEXT: vand.vx v4, v10, t0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv ra, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add ra, ra, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, ra
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs2r.v v4, (a0) # vscale x 16-byte Folded Spill
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t0, sp, 128
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vxor.vv v18, v18, v2
+; RV32-NEXT: vand.vx v2, v10, s2
+; RV32-NEXT: addi ra, sp, 120
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vxor.vv v18, v18, v2
+; RV32-NEXT: vand.vx v2, v10, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi a7, sp, 112
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v18, v18, v24
+; RV32-NEXT: vand.vx v4, v10, a6
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a6, sp, 104
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vxor.vv v18, v18, v26
+; RV32-NEXT: vand.vx v26, v10, a5
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a5, sp, 96
+; RV32-NEXT: vmul.vv v22, v8, v22
+; RV32-NEXT: vxor.vv v18, v18, v22
+; RV32-NEXT: vand.vx v24, v10, a4
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a4, 52(sp)
+; RV32-NEXT: addi a4, sp, 88
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v18, v18, v28
+; RV32-NEXT: vand.vx v28, v10, a3
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a3, 44(sp)
+; RV32-NEXT: addi a3, sp, 80
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v18, v18, v30
+; RV32-NEXT: vand.vx v30, v10, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a2, sp, 72
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a0, 262144
+; RV32-NEXT: sw a0, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: sw a1, 20(sp)
+; RV32-NEXT: addi a1, sp, 64
+; RV32-NEXT: sw a6, 4(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vl2r.v v22, (a6) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v0, v8, v22
+; RV32-NEXT: vxor.vv v0, v18, v0
+; RV32-NEXT: vlse64.v v18, (s7), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s7, sp, 56
+; RV32-NEXT: vmul.vv v14, v8, v14
+; RV32-NEXT: vxor.vv v14, v0, v14
+; RV32-NEXT: vlse64.v v18, (s6), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: mv s2, a6
+; RV32-NEXT: slli a6, a6, 3
+; RV32-NEXT: add a6, a6, s2
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v18, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s2, sp, 48
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v14, v14, v16
+; RV32-NEXT: vlse64.v v16, (s5), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: mv s5, a6
+; RV32-NEXT: slli a6, a6, 4
+; RV32-NEXT: add a6, a6, s5
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v16, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s5, sp, 40
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v12, v14, v12
+; RV32-NEXT: vlse64.v v14, (s4), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 5
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v14, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s4, sp, 32
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v12, v20
+; RV32-NEXT: vlse64.v v12, (s1), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: mv s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s1, s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s1, s1, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add a6, a6, s1
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s1, sp, 24
+; RV32-NEXT: vmul.vv v6, v8, v6
+; RV32-NEXT: vxor.vv v20, v20, v6
+; RV32-NEXT: vlse64.v v12, (s0), zero
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: slli a6, a6, 2
+; RV32-NEXT: mv s0, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add s0, s0, a6
+; RV32-NEXT: slli a6, a6, 1
+; RV32-NEXT: add a6, a6, s0
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 288
+; RV32-NEXT: vs2r.v v12, (a6) # vscale x 16-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 16
+; RV32-NEXT: csrr s6, vlenb
+; RV32-NEXT: slli s6, s6, 1
+; RV32-NEXT: mv a6, s6
+; RV32-NEXT: slli s6, s6, 1
+; RV32-NEXT: add a6, a6, s6
+; RV32-NEXT: slli s6, s6, 3
+; RV32-NEXT: add s6, s6, a6
+; RV32-NEXT: lw a6, 4(sp) # 4-byte Folded Reload
+; RV32-NEXT: add s6, sp, s6
+; RV32-NEXT: addi s6, s6, 288
+; RV32-NEXT: vl2r.v v12, (s6) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v6, v8, v12
+; RV32-NEXT: vmul.vv v2, v8, v2
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vmul.vv v26, v8, v26
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vmul.vv v30, v8, v30
+; RV32-NEXT: vxor.vv v20, v20, v6
+; RV32-NEXT: addi s6, sp, 224
+; RV32-NEXT: vlse64.v v0, (s6), zero
+; RV32-NEXT: vxor.vv v20, v20, v2
+; RV32-NEXT: vlse64.v v6, (t5), zero
+; RV32-NEXT: vxor.vv v20, v20, v4
+; RV32-NEXT: vlse64.v v22, (s8), zero
+; RV32-NEXT: vxor.vv v20, v20, v26
+; RV32-NEXT: vlse64.v v18, (s10), zero
+; RV32-NEXT: vxor.vv v20, v20, v24
+; RV32-NEXT: vlse64.v v16, (s11), zero
+; RV32-NEXT: vxor.vv v20, v20, v28
+; RV32-NEXT: vlse64.v v14, (s9), zero
+; RV32-NEXT: vxor.vv v2, v20, v30
+; RV32-NEXT: vlse64.v v12, (s3), zero
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v26, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v4, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 4
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v30, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 5
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v20, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v20, v10, v20
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v28, v10, v24
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vl2r.v v24, (t5) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vand.vv v24, v10, v24
+; RV32-NEXT: vand.vv v0, v10, v0
+; RV32-NEXT: vand.vv v6, v10, v6
+; RV32-NEXT: vand.vv v22, v10, v22
+; RV32-NEXT: vand.vv v18, v10, v18
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v18, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v10, v16
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v16, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v14, v10, v14
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add s3, s3, t5
+; RV32-NEXT: slli t5, t5, 1
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v14, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr t5, vlenb
+; RV32-NEXT: slli t5, t5, 3
+; RV32-NEXT: mv s3, t5
+; RV32-NEXT: slli t5, t5, 2
+; RV32-NEXT: add t5, t5, s3
+; RV32-NEXT: add t5, sp, t5
+; RV32-NEXT: addi t5, t5, 288
+; RV32-NEXT: vs2r.v v12, (t5) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (t6), zero
+; RV32-NEXT: vlse64.v v14, (t4), zero
+; RV32-NEXT: vlse64.v v16, (t3), zero
+; RV32-NEXT: vlse64.v v18, (t2), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 2
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t3, t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr t2, vlenb
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: mv t3, t2
+; RV32-NEXT: slli t2, t2, 1
+; RV32-NEXT: add t3, t3, t2
+; RV32-NEXT: slli t2, t2, 3
+; RV32-NEXT: add t2, t2, t3
+; RV32-NEXT: add t2, sp, t2
+; RV32-NEXT: addi t2, t2, 288
+; RV32-NEXT: vs2r.v v12, (t2) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (t1), zero
+; RV32-NEXT: vlse64.v v14, (t0), zero
+; RV32-NEXT: vlse64.v v16, (ra), zero
+; RV32-NEXT: vlse64.v v18, (a7), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 4
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: add t0, t0, a7
+; RV32-NEXT: slli a7, a7, 1
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a7, vlenb
+; RV32-NEXT: slli a7, a7, 2
+; RV32-NEXT: mv t0, a7
+; RV32-NEXT: slli a7, a7, 3
+; RV32-NEXT: add a7, a7, t0
+; RV32-NEXT: add a7, sp, a7
+; RV32-NEXT: addi a7, a7, 288
+; RV32-NEXT: vs2r.v v12, (a7) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (a6), zero
+; RV32-NEXT: vlse64.v v14, (a5), zero
+; RV32-NEXT: vlse64.v v16, (a4), zero
+; RV32-NEXT: vlse64.v v18, (a3), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a4, a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 3
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a3, vlenb
+; RV32-NEXT: slli a3, a3, 1
+; RV32-NEXT: mv a4, a3
+; RV32-NEXT: slli a3, a3, 4
+; RV32-NEXT: add a3, a3, a4
+; RV32-NEXT: add a3, sp, a3
+; RV32-NEXT: addi a3, a3, 288
+; RV32-NEXT: vs2r.v v12, (a3) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v12, (a2), zero
+; RV32-NEXT: vlse64.v v14, (a1), zero
+; RV32-NEXT: vlse64.v v16, (s7), zero
+; RV32-NEXT: vlse64.v v18, (s2), zero
+; RV32-NEXT: vand.vv v12, v10, v12
+; RV32-NEXT: addi a1, sp, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v14
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a2, a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v12, v10, v18
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 5
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v12, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vlse64.v v14, (s5), zero
+; RV32-NEXT: vlse64.v v16, (s4), zero
+; RV32-NEXT: vlse64.v v18, (s1), zero
+; RV32-NEXT: vlse64.v v12, (s0), zero
+; RV32-NEXT: vand.vv v14, v10, v14
+; RV32-NEXT: vand.vv v16, v10, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: mv a2, a1
+; RV32-NEXT: slli a1, a1, 2
+; RV32-NEXT: add a1, a1, a2
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 288
+; RV32-NEXT: vs2r.v v16, (a1) # vscale x 16-byte Folded Spill
+; RV32-NEXT: vand.vv v18, v10, v18
+; RV32-NEXT: vand.vv v16, v10, v12
+; RV32-NEXT: vand.vx v10, v10, a0
+; RV32-NEXT: vmul.vv v10, v8, v10
+; RV32-NEXT: vxor.vv v10, v2, v10
+; RV32-NEXT: vmul.vv v12, v8, v26
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v4
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v30
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v20
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v28
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v24
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v0
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v6
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v22
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v14
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v12, v8, v18
+; RV32-NEXT: vxor.vv v10, v10, v12
+; RV32-NEXT: vmul.vv v8, v8, v16
+; RV32-NEXT: vxor.vv v8, v10, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_v4i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; RV64-NEXT: vand.vi v12, v10, 2
+; RV64-NEXT: vand.vi v14, v10, 1
+; RV64-NEXT: vmul.vv v12, v8, v12
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v14, v12
+; RV64-NEXT: vand.vi v14, v10, 4
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vi v14, v10, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vand.vx v14, v10, a0
+; RV64-NEXT: vand.vx v10, v10, a1
+; RV64-NEXT: vmul.vv v14, v8, v14
+; RV64-NEXT: vxor.vv v12, v12, v14
+; RV64-NEXT: vmul.vv v8, v8, v10
+; RV64-NEXT: vxor.vv v8, v12, v8
+; RV64-NEXT: ret
+ %a = call <4 x i64> @llvm.clmul.v4i64(<4 x i64> %x, <4 x i64> %y)
+ ret <4 x i64> %a
+}
+
+define <8 x i64> @clmul_v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
+; RV32-LABEL: clmul_v8i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -352
+; RV32-NEXT: sw ra, 348(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 344(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 340(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 336(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 332(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 328(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s5, 324(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s6, 320(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s7, 316(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s8, 312(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s9, 308(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s10, 304(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s11, 300(sp) # 4-byte Folded Spill
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: lui a1, 524288
+; RV32-NEXT: li s4, 1
+; RV32-NEXT: li a3, 2
+; RV32-NEXT: li a2, 4
+; RV32-NEXT: li a0, 8
+; RV32-NEXT: li s3, 16
+; RV32-NEXT: li s2, 32
+; RV32-NEXT: li s5, 64
+; RV32-NEXT: li s6, 128
+; RV32-NEXT: li s8, 256
+; RV32-NEXT: li s1, 512
+; RV32-NEXT: li s7, 1024
+; RV32-NEXT: lui ra, 1
+; RV32-NEXT: lui s11, 2
+; RV32-NEXT: lui s10, 4
+; RV32-NEXT: lui s9, 8
+; RV32-NEXT: lui s0, 16
+; RV32-NEXT: lui t6, 32
+; RV32-NEXT: lui t5, 64
+; RV32-NEXT: lui t4, 128
+; RV32-NEXT: lui t3, 256
+; RV32-NEXT: lui t2, 512
+; RV32-NEXT: lui t1, 1024
+; RV32-NEXT: lui t0, 2048
+; RV32-NEXT: lui a7, 4096
+; RV32-NEXT: lui a6, 8192
+; RV32-NEXT: lui a5, 16384
+; RV32-NEXT: lui a4, 32768
+; RV32-NEXT: sw a1, 272(sp)
+; RV32-NEXT: sw zero, 276(sp)
+; RV32-NEXT: sw zero, 264(sp)
+; RV32-NEXT: sw s4, 268(sp)
+; RV32-NEXT: sw zero, 256(sp)
+; RV32-NEXT: sw a3, 260(sp)
+; RV32-NEXT: lui a3, 65536
+; RV32-NEXT: sw zero, 248(sp)
+; RV32-NEXT: sw a2, 252(sp)
+; RV32-NEXT: lui a2, 131072
+; RV32-NEXT: sw zero, 240(sp)
+; RV32-NEXT: sw a0, 244(sp)
+; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV32-NEXT: vand.vi v28, v12, 2
+; RV32-NEXT: vand.vi v4, v12, 1
+; RV32-NEXT: vand.vi v24, v12, 4
+; RV32-NEXT: vand.vi v20, v12, 8
+; RV32-NEXT: sw zero, 232(sp)
+; RV32-NEXT: sw s3, 236(sp)
+; RV32-NEXT: vand.vx v16, v12, s3
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: addi s3, sp, 272
+; RV32-NEXT: sw zero, 224(sp)
+; RV32-NEXT: sw s2, 228(sp)
+; RV32-NEXT: vand.vx v0, v12, s2
+; RV32-NEXT: addi s2, sp, 264
+; RV32-NEXT: sw zero, 216(sp)
+; RV32-NEXT: sw s5, 220(sp)
+; RV32-NEXT: vmul.vv v16, v8, v28
+; RV32-NEXT: vmul.vv v28, v8, v4
+; RV32-NEXT: vxor.vi v28, v28, 0
+; RV32-NEXT: vxor.vv v28, v28, v16
+; RV32-NEXT: vand.vx v16, v12, s5
+; RV32-NEXT: addi s5, sp, 256
+; RV32-NEXT: sw zero, 208(sp)
+; RV32-NEXT: sw s6, 212(sp)
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v28, v28, v24
+; RV32-NEXT: vand.vx v24, v12, s6
+; RV32-NEXT: addi s6, sp, 248
+; RV32-NEXT: sw zero, 200(sp)
+; RV32-NEXT: sw s8, 204(sp)
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v28, v20
+; RV32-NEXT: vand.vx v28, v12, s8
+; RV32-NEXT: addi s8, sp, 240
+; RV32-NEXT: sw zero, 192(sp)
+; RV32-NEXT: sw s1, 196(sp)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v4, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v20, v20, v4
+; RV32-NEXT: vand.vx v4, v12, s1
+; RV32-NEXT: sw zero, 184(sp)
+; RV32-NEXT: sw s7, 188(sp)
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v20, v20, v0
+; RV32-NEXT: vand.vx v0, v12, s7
+; RV32-NEXT: slli a0, s4, 11
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v20, v20, v16
+; RV32-NEXT: vand.vx v16, v12, ra
+; RV32-NEXT: sw zero, 176(sp)
+; RV32-NEXT: sw a0, 180(sp)
+; RV32-NEXT: sw zero, 168(sp)
+; RV32-NEXT: sw ra, 172(sp)
+; RV32-NEXT: addi s4, sp, 216
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vx v20, v12, s11
+; RV32-NEXT: sw zero, 160(sp)
+; RV32-NEXT: sw s11, 164(sp)
+; RV32-NEXT: addi s11, sp, 208
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v28, v24, v28
+; RV32-NEXT: vand.vx v24, v12, s10
+; RV32-NEXT: sw zero, 152(sp)
+; RV32-NEXT: sw s10, 156(sp)
+; RV32-NEXT: addi s10, sp, 200
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v4, v28, v4
+; RV32-NEXT: vand.vx v28, v12, s9
+; RV32-NEXT: sw zero, 144(sp)
+; RV32-NEXT: sw s9, 148(sp)
+; RV32-NEXT: addi s9, sp, 192
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v4, v4, v0
+; RV32-NEXT: vand.vx v0, v12, a0
+; RV32-NEXT: addi ra, sp, 184
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v0, v4, v0
+; RV32-NEXT: vand.vx v4, v12, s0
+; RV32-NEXT: sw zero, 136(sp)
+; RV32-NEXT: sw s0, 140(sp)
+; RV32-NEXT: addi s1, sp, 176
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v0, v0, v16
+; RV32-NEXT: vand.vx v16, v12, t6
+; RV32-NEXT: sw zero, 128(sp)
+; RV32-NEXT: sw t6, 132(sp)
+; RV32-NEXT: addi s0, sp, 168
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v0, v0, v20
+; RV32-NEXT: vand.vx v20, v12, t5
+; RV32-NEXT: sw zero, 120(sp)
+; RV32-NEXT: sw t5, 124(sp)
+; RV32-NEXT: addi t6, sp, 160
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v0, v0, v24
+; RV32-NEXT: vand.vx v24, v12, t4
+; RV32-NEXT: sw zero, 112(sp)
+; RV32-NEXT: sw t4, 116(sp)
+; RV32-NEXT: addi t5, sp, 152
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v0, v0, v28
+; RV32-NEXT: vand.vx v28, v12, t3
+; RV32-NEXT: sw zero, 104(sp)
+; RV32-NEXT: sw t3, 108(sp)
+; RV32-NEXT: addi t4, sp, 144
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v0, v0, v4
+; RV32-NEXT: vand.vx v4, v12, t2
+; RV32-NEXT: sw zero, 96(sp)
+; RV32-NEXT: sw t2, 100(sp)
+; RV32-NEXT: addi t3, sp, 136
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v0, v16
+; RV32-NEXT: vand.vx v0, v12, t1
+; RV32-NEXT: sw zero, 88(sp)
+; RV32-NEXT: sw t1, 92(sp)
+; RV32-NEXT: addi t2, sp, 128
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vand.vx v16, v12, t0
+; RV32-NEXT: sw zero, 80(sp)
+; RV32-NEXT: sw t0, 84(sp)
+; RV32-NEXT: addi t1, sp, 120
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v24, v20, v24
+; RV32-NEXT: vand.vx v20, v12, a7
+; RV32-NEXT: sw zero, 72(sp)
+; RV32-NEXT: sw a7, 76(sp)
+; RV32-NEXT: addi t0, sp, 112
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v24, v24, v28
+; RV32-NEXT: vand.vx v28, v12, a6
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: sw zero, 64(sp)
+; RV32-NEXT: sw a6, 68(sp)
+; RV32-NEXT: addi a7, sp, 104
+; RV32-NEXT: vmul.vv v28, v8, v4
+; RV32-NEXT: vxor.vv v24, v24, v28
+; RV32-NEXT: vand.vx v28, v12, a5
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v28, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: sw zero, 56(sp)
+; RV32-NEXT: sw a5, 60(sp)
+; RV32-NEXT: addi a6, sp, 96
+; RV32-NEXT: vmul.vv v28, v8, v0
+; RV32-NEXT: vxor.vv v28, v24, v28
+; RV32-NEXT: vand.vx v24, v12, a4
+; RV32-NEXT: sw zero, 48(sp)
+; RV32-NEXT: sw a4, 52(sp)
+; RV32-NEXT: addi a5, sp, 88
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v16, v28, v16
+; RV32-NEXT: vand.vx v28, v12, a3
+; RV32-NEXT: sw zero, 40(sp)
+; RV32-NEXT: sw a3, 44(sp)
+; RV32-NEXT: addi a4, sp, 80
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v16, v16, v20
+; RV32-NEXT: vand.vx v4, v12, a2
+; RV32-NEXT: sw zero, 32(sp)
+; RV32-NEXT: sw a2, 36(sp)
+; RV32-NEXT: addi a3, sp, 72
+; RV32-NEXT: sw zero, 24(sp)
+; RV32-NEXT: lui a1, 262144
+; RV32-NEXT: sw a1, 28(sp)
+; RV32-NEXT: sw zero, 16(sp)
+; RV32-NEXT: lui a0, 524288
+; RV32-NEXT: sw a0, 20(sp)
+; RV32-NEXT: addi a2, sp, 64
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v20, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v20, v8, v20
+; RV32-NEXT: vxor.vv v20, v16, v20
+; RV32-NEXT: vlse64.v v16, (s3), zero
+; RV32-NEXT: addi s3, sp, 56
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v0, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v0, v8, v0
+; RV32-NEXT: vxor.vv v0, v20, v0
+; RV32-NEXT: vlse64.v v20, (s2), zero
+; RV32-NEXT: addi s2, sp, 48
+; RV32-NEXT: vmul.vv v24, v8, v24
+; RV32-NEXT: vxor.vv v0, v0, v24
+; RV32-NEXT: vlse64.v v24, (s5), zero
+; RV32-NEXT: addi s5, sp, 40
+; RV32-NEXT: vmul.vv v28, v8, v28
+; RV32-NEXT: vxor.vv v0, v0, v28
+; RV32-NEXT: vlse64.v v28, (s6), zero
+; RV32-NEXT: addi s6, sp, 32
+; RV32-NEXT: vmul.vv v4, v8, v4
+; RV32-NEXT: vxor.vv v4, v0, v4
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v4, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v4, (s8), zero
+; RV32-NEXT: addi s8, sp, 24
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add s7, s7, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, s7
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: addi s7, sp, 232
+; RV32-NEXT: vlse64.v v16, (s7), zero
+; RV32-NEXT: addi s7, sp, 224
+; RV32-NEXT: vlse64.v v20, (s7), zero
+; RV32-NEXT: vlse64.v v24, (s4), zero
+; RV32-NEXT: vlse64.v v28, (s11), zero
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 4
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 4
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr s4, vlenb
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: mv s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 1
+; RV32-NEXT: add s7, s7, s4
+; RV32-NEXT: slli s4, s4, 2
+; RV32-NEXT: add s4, s4, s7
+; RV32-NEXT: add s4, sp, s4
+; RV32-NEXT: addi s4, s4, 288
+; RV32-NEXT: vs4r.v v16, (s4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v20, (s10), zero
+; RV32-NEXT: vlse64.v v24, (s9), zero
+; RV32-NEXT: vlse64.v v28, (ra), zero
+; RV32-NEXT: vlse64.v v4, (s1), zero
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 6
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: slli s1, s1, 3
+; RV32-NEXT: mv s4, s1
+; RV32-NEXT: slli s1, s1, 1
+; RV32-NEXT: add s4, s4, s1
+; RV32-NEXT: slli s1, s1, 2
+; RV32-NEXT: add s1, s1, s4
+; RV32-NEXT: add s1, sp, s1
+; RV32-NEXT: addi s1, s1, 288
+; RV32-NEXT: vs4r.v v16, (s1) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v24, (s0), zero
+; RV32-NEXT: vlse64.v v28, (t6), zero
+; RV32-NEXT: vlse64.v v4, (t5), zero
+; RV32-NEXT: vlse64.v v0, (t4), zero
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 3
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 3
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v4
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 1
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v0
+; RV32-NEXT: csrr t4, vlenb
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: mv t5, t4
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: add t5, t5, t4
+; RV32-NEXT: slli t4, t4, 2
+; RV32-NEXT: add t4, t4, t5
+; RV32-NEXT: add t4, sp, t4
+; RV32-NEXT: addi t4, t4, 288
+; RV32-NEXT: vs4r.v v16, (t4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v28, (t3), zero
+; RV32-NEXT: vlse64.v v4, (t2), zero
+; RV32-NEXT: vlse64.v v0, (t1), zero
+; RV32-NEXT: vlse64.v v16, (t0), zero
+; RV32-NEXT: vand.vv v20, v12, v28
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v20, v12, v4
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 5
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v20, v12, v0
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 3
+; RV32-NEXT: mv t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t1, t1, t0
+; RV32-NEXT: slli t0, t0, 1
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v20, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: csrr t0, vlenb
+; RV32-NEXT: slli t0, t0, 4
+; RV32-NEXT: mv t1, t0
+; RV32-NEXT: slli t0, t0, 2
+; RV32-NEXT: add t0, t0, t1
+; RV32-NEXT: add t0, sp, t0
+; RV32-NEXT: addi t0, t0, 288
+; RV32-NEXT: vs4r.v v16, (t0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (a7), zero
+; RV32-NEXT: vlse64.v v0, (a6), zero
+; RV32-NEXT: vlse64.v v20, (a5), zero
+; RV32-NEXT: vlse64.v v24, (a4), zero
+; RV32-NEXT: vand.vv v4, v12, v16
+; RV32-NEXT: vand.vv v16, v12, v0
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a4, vlenb
+; RV32-NEXT: slli a4, a4, 2
+; RV32-NEXT: mv a5, a4
+; RV32-NEXT: slli a4, a4, 1
+; RV32-NEXT: add a5, a5, a4
+; RV32-NEXT: slli a4, a4, 3
+; RV32-NEXT: add a4, a4, a5
+; RV32-NEXT: add a4, sp, a4
+; RV32-NEXT: addi a4, a4, 288
+; RV32-NEXT: vs4r.v v16, (a4) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (a3), zero
+; RV32-NEXT: vlse64.v v20, (a2), zero
+; RV32-NEXT: vlse64.v v24, (s3), zero
+; RV32-NEXT: vlse64.v v28, (s2), zero
+; RV32-NEXT: vand.vv v0, v12, v16
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v24
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 4
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v28
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: mv a3, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a2, a3
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 288
+; RV32-NEXT: vs4r.v v16, (a2) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vlse64.v v16, (s5), zero
+; RV32-NEXT: vlse64.v v20, (s6), zero
+; RV32-NEXT: vlse64.v v24, (s8), zero
+; RV32-NEXT: vlse64.v v28, (a0), zero
+; RV32-NEXT: vand.vv v16, v12, v16
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v16, v12, v20
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a2, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vs4r.v v16, (a0) # vscale x 32-byte Folded Spill
+; RV32-NEXT: vand.vv v24, v12, v24
+; RV32-NEXT: vand.vv v20, v12, v28
+; RV32-NEXT: vand.vx v12, v12, a1
+; RV32-NEXT: vmul.vv v12, v8, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vxor.vv v12, v16, v12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 6
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v4
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v0
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 4
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: addi a0, sp, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 288
+; RV32-NEXT: vl4r.v v16, (a0) # vscale x 32-byte Folded Reload
+; RV32-NEXT: vmul.vv v16, v8, v16
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v16, v8, v24
+; RV32-NEXT: vxor.vv v12, v12, v16
+; RV32-NEXT: vmul.vv v8, v8, v20
+; RV32-NEXT: vxor.vv v8, v12, v8
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 3
+; RV32-NEXT: mv a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a1, a1, a0
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, a0, a1
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: lw ra, 348(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 344(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 340(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 336(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 332(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 328(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s5, 324(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s6, 320(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s7, 316(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s8, 312(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s9, 308(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s10, 304(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s11, 300(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 352
+; RV32-NEXT: ret
+;
+; RV64-LABEL: clmul_v8i64:
+; RV64: # %bb.0:
+; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-NEXT: vand.vi v16, v12, 2
+; RV64-NEXT: vand.vi v20, v12, 1
+; RV64-NEXT: vmul.vv v16, v8, v16
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v20, v16
+; RV64-NEXT: vand.vi v20, v12, 4
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vi v20, v12, 8
+; RV64-NEXT: li a0, 16
+; RV64-NEXT: li a1, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a0, 64
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, 128
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a0, 256
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, 512
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: li a2, 1024
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a0, 1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a2
+; RV64-NEXT: slli a1, a0, 11
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 2
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 4
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 8
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 16
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 64
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 128
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 256
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 512
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 1024
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 2048
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 4096
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 8192
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 16384
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 32768
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 65536
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 131072
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: lui a1, 262144
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 31
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 32
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 33
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 34
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 35
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 36
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 37
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 38
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 39
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 40
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 41
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 42
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 43
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 44
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 45
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 46
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 47
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 48
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 49
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 50
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 51
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 52
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 53
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 54
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 55
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 56
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 57
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 58
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 59
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 60
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: slli a1, a0, 61
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a1
+; RV64-NEXT: li a1, -1
+; RV64-NEXT: slli a0, a0, 62
+; RV64-NEXT: slli a1, a1, 63
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vand.vx v20, v12, a0
+; RV64-NEXT: vand.vx v12, v12, a1
+; RV64-NEXT: vmul.vv v20, v8, v20
+; RV64-NEXT: vxor.vv v16, v16, v20
+; RV64-NEXT: vmul.vv v8, v8, v12
+; RV64-NEXT: vxor.vv v8, v16, v8
+; RV64-NEXT: ret
+ %a = call <8 x i64> @llvm.clmul.v8i64(<8 x i64> %x, <8 x i64> %y)
+ ret <8 x i64> %a
+}
+
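+; The fixed-length i8 case uses the same generic expansion: each bit i of %b
+; is isolated with a vand, multiplied into %a, and the partial products are
+; XOR-accumulated.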
+define <4 x i8> @clmul_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; CHECK-LABEL: clmul_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %res = call <4 x i8> @llvm.clmul.v4i8(<4 x i8> %a, <4 x i8> %b)
+ ret <4 x i8> %res
+}
+
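+; The i16 element case walks all 16 multiplier bits; masks up to 8 fit the
+; 5-bit vand.vi immediate, after which the expansion switches to vand.vx.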
+define <4 x i16> @clmul_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
+; CHECK-LABEL: clmul_v4i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vand.vi v10, v9, 2
+; CHECK-NEXT: vand.vi v11, v9, 1
+; CHECK-NEXT: vmul.vv v10, v8, v10
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v11, v10
+; CHECK-NEXT: vand.vi v11, v9, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vi v11, v9, 8
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vand.vx v11, v9, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vand.vx v9, v9, a0
+; CHECK-NEXT: vmul.vv v11, v8, v11
+; CHECK-NEXT: vxor.vv v10, v10, v11
+; CHECK-NEXT: vmul.vv v8, v8, v9
+; CHECK-NEXT: vxor.vv v8, v10, v8
+; CHECK-NEXT: ret
+ %res = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %a, <4 x i16> %b)
+ ret <4 x i16> %res
+}
+
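+; clmulr-style pattern: the i8 inputs are zero-extended to i16, carry-less
+; multiplied at the wider type, then narrowed with a shift by the element
+; width minus one (7), i.e. the bit-reversed carry-less multiply.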
+define <4 x i8> @clmulr_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; CHECK-LABEL: clmulr_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v8, v9
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vand.vi v9, v8, 2
+; CHECK-NEXT: vand.vi v11, v8, 1
+; CHECK-NEXT: vmul.vv v9, v10, v9
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v11, v9
+; CHECK-NEXT: vand.vi v11, v8, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vi v11, v8, 8
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vmul.vv v8, v10, v8
+; CHECK-NEXT: vxor.vv v8, v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 7
+; CHECK-NEXT: ret
+ %a.ext = zext <4 x i8> %a to <4 x i16>
+ %b.ext = zext <4 x i8> %b to <4 x i16>
+  %clmul = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %a.ext, <4 x i16> %b.ext)
+ %res.ext = lshr <4 x i16> %clmul, splat(i16 7)
+ %res = trunc <4 x i16> %res.ext to <4 x i8>
+ ret <4 x i8> %res
+}
+
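+; clmulh-style pattern: same widening scheme, but the shift is by the full
+; element width (8), so only the high half of the carry-less product
+; survives the truncation.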
+define <4 x i8> @clmulh_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
+; CHECK-LABEL: clmulh_v4i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vzext.vf2 v10, v8
+; CHECK-NEXT: vzext.vf2 v8, v9
+; CHECK-NEXT: li a0, 16
+; CHECK-NEXT: vand.vi v9, v8, 2
+; CHECK-NEXT: vand.vi v11, v8, 1
+; CHECK-NEXT: vmul.vv v9, v10, v9
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v11, v9
+; CHECK-NEXT: vand.vi v11, v8, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vi v11, v8, 8
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 32
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 64
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 128
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 256
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 512
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 1024
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: li a0, 1
+; CHECK-NEXT: slli a0, a0, 11
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 1
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 2
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 4
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vand.vx v11, v8, a0
+; CHECK-NEXT: lui a0, 8
+; CHECK-NEXT: vand.vx v8, v8, a0
+; CHECK-NEXT: vmul.vv v11, v10, v11
+; CHECK-NEXT: vxor.vv v9, v9, v11
+; CHECK-NEXT: vmul.vv v8, v10, v8
+; CHECK-NEXT: vxor.vv v8, v9, v8
+; CHECK-NEXT: vsetvli zero, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vnsrl.wi v8, v8, 8
+; CHECK-NEXT: ret
+ %a.ext = zext <4 x i8> %a to <4 x i16>
+ %b.ext = zext <4 x i8> %b to <4 x i16>
+  %clmul = call <4 x i16> @llvm.clmul.v4i16(<4 x i16> %a.ext, <4 x i16> %b.ext)
+ %res.ext = lshr <4 x i16> %clmul, splat(i16 8)
+ %res = trunc <4 x i16> %res.ext to <4 x i8>
+ ret <4 x i8> %res
+}