[llvm] [X86] Expand i512 shifts on AVX512 targets (PR #183198)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 3 07:22:12 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/183198
>From 4e50b8757bcbaf3dcb769e11248bfa3450c7a82e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 10 Dec 2025 15:39:11 +0000
Subject: [PATCH] [X86] Expand i512 shifts on AVX512 targets
Cast the i512 to vXi64 and use EXPAND/COMPRESS to shift the i64 elements left/right into place, then use FSHL/FSHR to perform the final sub-element bitshift.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 110 +
llvm/test/CodeGen/X86/shift-i512.ll | 1895 ++-----
...lar-shift-by-byte-multiple-legalization.ll | 4399 ++++++-----------
3 files changed, 2222 insertions(+), 4182 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c570aff23f53..25b291b04417f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1888,6 +1888,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::i512, Custom);
setOperationAction(ISD::ADD, MVT::i512, Custom);
setOperationAction(ISD::SUB, MVT::i512, Custom);
+ setOperationAction(ISD::SRL, MVT::i512, Custom);
+ setOperationAction(ISD::SHL, MVT::i512, Custom);
+ setOperationAction(ISD::SRA, MVT::i512, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
@@ -2932,6 +2935,10 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
// Check for larger than legal scalar integer ops that might have been
// custom lowered to vector instruction.
switch (Opcode) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
case ISD::AND:
case ISD::OR:
case ISD::XOR:
@@ -34426,6 +34433,92 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDValue Amt = N->getOperand(1);
+ assert(Subtarget.useAVX512Regs() && "AVX512F required");
+ assert(VT == MVT::i512 && "Unexpected VT!");
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+
+ if (!mayFoldIntoVector(Src, DAG, Subtarget))
+ return;
+
+ // Early out if this will fold to a constant shift of whole byte elements.
+ // TODO: Directly lower to a shuffle?
+ if (auto *AmtC = dyn_cast<ConstantSDNode>(Amt)) {
+ assert(AmtC->getAPIntValue().ult(512) && "Out of bounds shift amount");
+ if (AmtC->getAPIntValue().urem(8) == 0)
+ return;
+ }
+
+ SDValue AmtLane = DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i32),
+ DAG.getShiftAmountConstant(6, MVT::i32, dl));
+ AmtLane = DAG.getZExtOrTrunc(AmtLane, dl, MVT::i8);
+
+ if (auto *SrcC = dyn_cast<ConstantSDNode>(Src)) {
+ // Special case: SHL(1,Amt) --> SELECT(1<<(Amt/64), SPLAT(1<<(Amt%64)), 0)
+ if (Opc == ISD::SHL && SrcC->getAPIntValue() == 1) {
+ SDValue Bit = DAG.getConstant(1, dl, MVT::i64);
+ SDValue AmtMod = DAG.getNode(ISD::AND, dl, MVT::i64,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i64),
+ DAG.getConstant(63, dl, MVT::i64));
+ SDValue LaneMask = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtLane);
+ LaneMask =
+ DAG.getBitcast(BoolVT, DAG.getZExtOrTrunc(LaneMask, dl, MVT::i8));
+ SDValue Elt = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtMod);
+ SDValue Res =
+ DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
+ DAG.getConstant(0, dl, VecVT));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
+ }
+
+ // Use EXPAND/COMPRESS to shuffle the i64 elements left/right with the
+ // ShiftAmt/64 'laneshift', and then shuffle one element along to get the
+ // shifted in bits from the neighbouring element. Finally use a funnel shift
+ // with the ShiftAmt%64 'elementshift' to get the final result.
+ SDValue Mask =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::SHL, dl, MVT::i32,
+ DAG.getAllOnesConstant(dl, MVT::i32), AmtLane));
+ Src = DAG.getBitcast(VecVT, Src);
+
+ SDValue PassThrough;
+ if (Opc == ISD::SRA) {
+ // Splat the MSB sign bit across the vector.
+ PassThrough = DAG.getNode(ISD::SRA, dl, VecVT, Src,
+ DAG.getShiftAmountConstant(63, VecVT, dl));
+ PassThrough = DAG.getVectorShuffle(VecVT, dl, PassThrough, PassThrough,
+ {7, 7, 7, 7, 7, 7, 7, 7});
+ } else {
+ PassThrough = DAG.getConstant(0, dl, VecVT);
+ }
+ SDValue A, B;
+ if (Opc == ISD::SHL) {
+ A = DAG.getNode(X86ISD::EXPAND, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ B = DAG.getVectorShuffle(VecVT, dl, PassThrough, A,
+ {7, 8, 9, 10, 11, 12, 13, 14});
+ } else {
+ B = DAG.getNode(X86ISD::COMPRESS, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ A = DAG.getVectorShuffle(VecVT, dl, B, PassThrough,
+ {1, 2, 3, 4, 5, 6, 7, 8});
+ }
+ // Funnel shifts use modulo shift amount so no need to explicitly mask it.
+ SDValue Res =
+ DAG.getNode(Opc == ISD::SHL ? ISD::FSHL : ISD::FSHR, dl, VecVT, A, B,
+ DAG.getSplatBuildVector(
+ VecVT, dl, DAG.getZExtOrTrunc(Amt, dl, MVT::i64)));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
case ISD::CTPOP: {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -48071,6 +48164,23 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
dl, DAG, DCI))
return V;
+ // Scalarize a single-use funnel shift.
+ // Ideally DAG would handle this similarly to scalarizeExtractedBinOp.
+ if (InputVector.getOpcode() == ISD::FSHL ||
+ InputVector.getOpcode() == ISD::FSHR) {
+ if (CIdx && InputVector.hasOneUse() &&
+ TLI.isOperationLegal(InputVector.getOpcode(), VT)) {
+ SDValue LHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(0),
+ CIdx->getZExtValue());
+ SDValue RHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(1),
+ CIdx->getZExtValue());
+ SDValue Amt = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(2),
+ CIdx->getZExtValue());
+ Amt = DAG.getShiftAmountOperand(VT, Amt);
+ return DAG.getNode(InputVector.getOpcode(), dl, VT, LHS, RHS, Amt);
+ }
+ }
+
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
//
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..c8ac18d1d309a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -133,68 +133,42 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: shl_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: movslq %eax, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: cltq
+; AVX512F-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsllq %xmm1, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7],zmm2[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
@@ -210,53 +184,31 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: movslq %eax, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: cltq
+; AVX512VL-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -265,50 +217,23 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: movslq %eax, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: movl %edi, %ecx
+; AVX512VBMI-NEXT: shrl $3, %ecx
+; AVX512VBMI-NEXT: andl $56, %ecx
+; AVX512VBMI-NEXT: negl %ecx
+; AVX512VBMI-NEXT: movslq %ecx, %rcx
+; AVX512VBMI-NEXT: vmovdqu64 -64(%rsp,%rcx), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 %a0, %a1
@@ -428,16 +353,14 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -445,44 +368,25 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %r15, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %rsi, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -498,91 +402,52 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT: shrl $3, %edi
+; AVX512VBMI-NEXT: andl $56, %edi
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = lshr i512 %a0, %a1
@@ -711,14 +576,12 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -735,49 +598,32 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpsraq $63, -72(%rsp,%rax){1to8}, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %r15, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %rsi, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -793,97 +639,62 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: shrl $3, %ecx
+; AVX512VL-NEXT: andl $56, %ecx
+; AVX512VL-NEXT: vpsraq $63, -72(%rsp,%rcx){1to8}, %zmm0
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,15]
+; AVX512VL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm2, %zmm0
+; AVX512VL-NEXT: andl $63, %eax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm2
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm3
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: sarq $63, %rdi
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpbroadcastq %r10, %zmm0
+; AVX512VBMI-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX512VBMI-NEXT: shrl $3, %r10d
+; AVX512VBMI-NEXT: andl $56, %r10d
+; AVX512VBMI-NEXT: vpsraq $63, -72(%rsp,%r10){1to8}, %zmm1
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%r10), %zmm2
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
+; AVX512VBMI-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm3, %zmm2
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm2, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = ashr i512 %a0, %a1
ret i512 %r
@@ -1008,162 +819,64 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: shl_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: negl %edx
-; AVX512F-NEXT: movslq %edx, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %edx, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpexpandq (%rsi), %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: negl %edx
-; AVX512VL-NEXT: movslq %edx, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq (%rsi), %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: negl %edx
-; AVX512VBMI-NEXT: movslq %edx, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edi
+; AVX512VBMI-NEXT: shlxl %ecx, %edi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1276,141 +989,67 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %edx, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rsi, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rsi, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1539,174 +1178,76 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512F-NEXT: movq 48(%rsi), %rax
-; AVX512F-NEXT: movq 56(%rsi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm4
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %rdx, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rsi, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512VL-NEXT: movq 48(%rsi), %rax
-; AVX512VL-NEXT: movq 56(%rsi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rsi, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %esi
+; AVX512VL-NEXT: shlxl %edx, %esi, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rsi), %rax
-; AVX512VBMI-NEXT: movq 56(%rsi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm2
+; AVX512VBMI-NEXT: vpermq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm2
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm3 {%k1}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm0 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm2, %zmm0, %zmm3
+; AVX512VBMI-NEXT: vmovdqu64 %zmm3, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -2223,160 +1764,39 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: shl_1_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: negl %esi
-; AVX512F-NEXT: movslq %esi, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: movl $1, %ecx
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_1_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: negl %esi
-; AVX512VL-NEXT: movslq %esi, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: movl $1, %ecx
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_1_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: negl %esi
-; AVX512VBMI-NEXT: movslq %esi, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: movl $1, %ecx
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT: shrl $6, %esi
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 1, %a0
@@ -2485,139 +1905,67 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: lshr_signbit_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_signbit_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_signbit_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -2732,141 +2080,70 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: ashr_signbit_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
-; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %rsi, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_signbit_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %esi, %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_signbit_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm0
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -3099,63 +2376,51 @@ define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovups (%rdi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: movl %ecx, %eax
+; AVX512VBMI-NEXT: shrl $6, %eax
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT: vmovq %xmm0, %rax
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3232,96 +2497,60 @@ define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT: movq 48(%rdi), %rax
-; AVX512F-NEXT: movq 56(%rdi), %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT: movq 48(%rdi), %rax
-; AVX512VL-NEXT: movq 56(%rdi), %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rdi), %rax
-; AVX512VBMI-NEXT: movq 56(%rdi), %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT: movl %ecx, %eax
+; AVX512VBMI-NEXT: shrl $6, %eax
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512VBMI-NEXT: vmovq %xmm1, %rax
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3370,45 +2599,33 @@ define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rdi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: movl $-1, %eax
+; AVX512VL-NEXT: shlxl %esi, %eax, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: movl $-1, %eax
+; AVX512VBMI-NEXT: shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovq %xmm0, %rax
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3474,78 +2691,42 @@ define i64 @ashr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT: movq 48(%rdi), %rax
-; AVX512F-NEXT: movq 56(%rdi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT: movq 48(%rdi), %rax
-; AVX512VL-NEXT: movq 56(%rdi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl $-1, %eax
+; AVX512VL-NEXT: shlxl %esi, %eax, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rdi), %rax
-; AVX512VBMI-NEXT: movq 56(%rdi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT: movl $-1, %eax
+; AVX512VBMI-NEXT: shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vmovq %xmm1, %rax
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 65b602801b365..e9ddc576c6cd8 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -11,10 +11,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-HAVE-SHLD-HAVE-BMI2-AVX512
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2
@@ -27,10 +27,10 @@
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-HAVE-SHLD-HAVE-BMI2-AVX512
define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
@@ -10617,262 +10617,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rcx, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
@@ -12810,563 +12602,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %cl, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ecx,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 120(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 116(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 124(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 48(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%ecx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -13474,26 +12763,29 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $7, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -13682,29 +12974,35 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $140, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT: andl $7, %ecx
-; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $140, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -14606,267 +13904,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r9, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rcx, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
@@ -16892,596 +15975,62 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%edi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%eax,8), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%edx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ebx,8), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl %ebx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebp, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 176(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 48(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl (%esi), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm3 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %edx, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpexpandq (%ecx), %zmm3 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -17598,29 +16147,27 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: shl_64bytes_qwordOff:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: shll $3, %eax
-; X64-AVX512-NEXT: andl $56, %eax
-; X64-AVX512-NEXT: negl %eax
-; X64-AVX512-NEXT: cltq
-; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: shl_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -17821,33 +16368,37 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: shl_64bytes_qwordOff:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $140, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: shll $3, %ecx
-; X86-AVX512-NEXT: andl $56, %ecx
-; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: subl %ecx, %edx
-; X86-AVX512-NEXT: vmovups (%edx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2
-; X86-AVX512-NEXT: negl %ecx
-; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $140, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -18501,321 +17052,378 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X64-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X64-NO-BMI2-AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; X64-NO-BMI2-AVX512-NEXT: shll $3, %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm3, %xmm0, %xmm4
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-NO-BMI2-AVX512-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT: shll $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm3, %xmm0, %xmm4
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
@@ -20304,668 +18912,733 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%ecx), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%eax,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%ecx), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1
+; X86-NO-BMI2-AVX512-NEXT: movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vmovdqa64 %zmm0, %zmm2
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm3, %xmm1, %xmm4
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqa64 %zmm0, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm3, %xmm1, %xmm4
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -21059,39 +19732,69 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_64bytes_qwordOff:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rax
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT: movq 48(%rdi), %rax
-; X64-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-AVX-NEXT: movl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $7, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
-; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: popq %rax
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-AVX1-NEXT: movq 48(%rdi), %rax
+; X64-AVX1-NEXT: movq 56(%rdi), %rcx
+; X64-AVX1-NEXT: movl (%rsi), %esi
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: sarq $63, %rcx
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: andl $7, %esi
+; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX1-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX1-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
+; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -21289,60 +19992,96 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_64bytes_qwordOff:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %ebx
-; X86-AVX-NEXT: pushl %edi
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: subl $128, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
-; X86-AVX-NEXT: vmovups 32(%edx), %xmm1
-; X86-AVX-NEXT: movl 48(%edx), %esi
-; X86-AVX-NEXT: movl 52(%edx), %edi
-; X86-AVX-NEXT: movl 56(%edx), %ebx
-; X86-AVX-NEXT: movl 60(%edx), %edx
-; X86-AVX-NEXT: movl (%ecx), %ecx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: sarl $31, %edx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $7, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $128, %esp
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: popl %edi
-; X86-AVX-NEXT: popl %ebx
-; X86-AVX-NEXT: vzeroupper
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebx
+; X86-AVX1-NEXT: pushl %edi
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: vmovups (%edx), %ymm0
+; X86-AVX1-NEXT: vmovups 32(%edx), %xmm1
+; X86-AVX1-NEXT: movl 48(%edx), %esi
+; X86-AVX1-NEXT: movl 52(%edx), %edi
+; X86-AVX1-NEXT: movl 56(%edx), %ebx
+; X86-AVX1-NEXT: movl 60(%edx), %edx
+; X86-AVX1-NEXT: movl (%ecx), %ecx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
+; X86-AVX1-NEXT: sarl $31, %edx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: andl $7, %ecx
+; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
+; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
+; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
+; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: popl %edi
+; X86-AVX1-NEXT: popl %ebx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -21354,4 +20093,14 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; X64: {{.*}}
+; X64-AVX512: {{.*}}
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-NO-BMI2-AVX512: {{.*}}
; X86: {{.*}}
+; X86-AVX512: {{.*}}
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-NO-BMI2-AVX512: {{.*}}
More information about the llvm-commits
mailing list