[llvm] [X86] Expand i512 shifts on AVX512 targets (PR #183198)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 6 02:20:04 PST 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/183198
>From 19ce540ab7fdbad7bcdee12e83e4606d810926b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 10 Dec 2025 15:39:11 +0000
Subject: [PATCH] [X86] Expand i512 shifts on AVX512 targets
Cast to vXi64 and use EXPAND/COMPRESS to left/right shift the i64 elements into place, then use FSHL/FSHR to perform the final sub-element bit shift.
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 110 +
llvm/test/CodeGen/X86/bit-manip-i512.ll | 2964 ++++-------
llvm/test/CodeGen/X86/shift-i512.ll | 1895 ++-----
...lar-shift-by-byte-multiple-legalization.ll | 4399 ++++++-----------
4 files changed, 3188 insertions(+), 6180 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6bb558f4ef6da..bf1d36bf06a39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1888,6 +1888,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::i512, Custom);
setOperationAction(ISD::ADD, MVT::i512, Custom);
setOperationAction(ISD::SUB, MVT::i512, Custom);
+ setOperationAction(ISD::SRL, MVT::i512, Custom);
+ setOperationAction(ISD::SHL, MVT::i512, Custom);
+ setOperationAction(ISD::SRA, MVT::i512, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
@@ -2936,6 +2939,10 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
// Check for larger than legal scalar integer ops that might have been
// custom lowered to vector instruction.
switch (Opcode) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
case ISD::AND:
case ISD::OR:
case ISD::XOR:
@@ -34431,6 +34438,92 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDValue Amt = N->getOperand(1);
+ assert(Subtarget.useAVX512Regs() && "AVX512F required");
+ assert(VT == MVT::i512 && "Unexpected VT!");
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+
+ if (!mayFoldIntoVector(Src, DAG, Subtarget))
+ return;
+
+ // Early out if this will fold to a constant shift of whole byte elements.
+ // TODO: Directly lower to a shuffle?
+ if (auto *AmtC = dyn_cast<ConstantSDNode>(Amt)) {
+ assert(AmtC->getAPIntValue().ult(512) && "Out of bounds shift amount");
+ if (AmtC->getAPIntValue().urem(8) == 0)
+ return;
+ }
+
+ SDValue AmtLane = DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i32),
+ DAG.getShiftAmountConstant(6, MVT::i32, dl));
+ AmtLane = DAG.getZExtOrTrunc(AmtLane, dl, MVT::i8);
+
+ if (auto *SrcC = dyn_cast<ConstantSDNode>(Src)) {
+ // Special case: SHL(1,Amt) --> SELECT(1<<(Amt/64), SPLAT(1<<(Amt%64)), 0)
+ if (Opc == ISD::SHL && SrcC->getAPIntValue() == 1) {
+ SDValue Bit = DAG.getConstant(1, dl, MVT::i64);
+ SDValue AmtMod = DAG.getNode(ISD::AND, dl, MVT::i64,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i64),
+ DAG.getConstant(63, dl, MVT::i64));
+ SDValue LaneMask = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtLane);
+ LaneMask =
+ DAG.getBitcast(BoolVT, DAG.getZExtOrTrunc(LaneMask, dl, MVT::i8));
+ SDValue Elt = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtMod);
+ SDValue Res =
+ DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
+ DAG.getConstant(0, dl, VecVT));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
+ }
+
+ // Use EXPAND/COMPRESS to shuffle the i64 elements left/right with the
+ // ShiftAmt/64 'laneshift', and then shuffle one element along to get the
+ // shifted in bits from the neighbouring element. Finally use a funnel shift
+ // with the ShiftAmt%64 'elementshift' to get the final result.
+ SDValue Mask =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::SHL, dl, MVT::i32,
+ DAG.getAllOnesConstant(dl, MVT::i32), AmtLane));
+ Src = DAG.getBitcast(VecVT, Src);
+
+ SDValue PassThrough;
+ if (Opc == ISD::SRA) {
+ // Splat the MSB sign bit across the vector.
+ PassThrough = DAG.getNode(ISD::SRA, dl, VecVT, Src,
+ DAG.getShiftAmountConstant(63, VecVT, dl));
+ PassThrough = DAG.getVectorShuffle(VecVT, dl, PassThrough, PassThrough,
+ {7, 7, 7, 7, 7, 7, 7, 7});
+ } else {
+ PassThrough = DAG.getConstant(0, dl, VecVT);
+ }
+ SDValue A, B;
+ if (Opc == ISD::SHL) {
+ A = DAG.getNode(X86ISD::EXPAND, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ B = DAG.getVectorShuffle(VecVT, dl, PassThrough, A,
+ {7, 8, 9, 10, 11, 12, 13, 14});
+ } else {
+ B = DAG.getNode(X86ISD::COMPRESS, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ A = DAG.getVectorShuffle(VecVT, dl, B, PassThrough,
+ {1, 2, 3, 4, 5, 6, 7, 8});
+ }
+ // Funnel shifts use modulo shift amount so no need to explicitly mask it.
+ SDValue Res =
+ DAG.getNode(Opc == ISD::SHL ? ISD::FSHL : ISD::FSHR, dl, VecVT, A, B,
+ DAG.getSplatBuildVector(
+ VecVT, dl, DAG.getZExtOrTrunc(Amt, dl, MVT::i64)));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
case ISD::CTPOP: {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -48076,6 +48169,23 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
dl, DAG, DCI))
return V;
+ // Scalarize single use funnel shift.
+ // Ideally DAG would handle this similar to scalarizeExtractedBinOp.
+ if (InputVector.getOpcode() == ISD::FSHL ||
+ InputVector.getOpcode() == ISD::FSHR) {
+ if (CIdx && InputVector.hasOneUse() &&
+ TLI.isOperationLegal(InputVector.getOpcode(), VT)) {
+ SDValue LHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(0),
+ CIdx->getZExtValue());
+ SDValue RHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(1),
+ CIdx->getZExtValue());
+ SDValue Amt = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(2),
+ CIdx->getZExtValue());
+ Amt = DAG.getShiftAmountOperand(VT, Amt);
+ return DAG.getNode(InputVector.getOpcode(), dl, VT, LHS, RHS, Amt);
+ }
+ }
+
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
//
diff --git a/llvm/test/CodeGen/X86/bit-manip-i512.ll b/llvm/test/CodeGen/X86/bit-manip-i512.ll
index ae0243257451a..bc0ca3d799405 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i512.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i512.ll
@@ -239,330 +239,274 @@ define i512 @bext_i512(i512 %a0, i512 %idx, i512 %len) nounwind {
;
; AVX512F-LABEL: bext_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $168, %rsp
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %eax
-; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: movslq %eax, %r10
-; AVX512F-NEXT: movq 144(%rsp,%r10), %r11
-; AVX512F-NEXT: movq 152(%rsp,%r10), %rax
-; AVX512F-NEXT: shldq %cl, %r11, %rax
-; AVX512F-NEXT: movq 136(%rsp,%r10), %rbx
-; AVX512F-NEXT: shldq %cl, %rbx, %r11
-; AVX512F-NEXT: movq 128(%rsp,%r10), %r14
-; AVX512F-NEXT: shldq %cl, %r14, %rbx
-; AVX512F-NEXT: movq 120(%rsp,%r10), %r15
-; AVX512F-NEXT: shldq %cl, %r15, %r14
-; AVX512F-NEXT: movq 112(%rsp,%r10), %r13
-; AVX512F-NEXT: shldq %cl, %r13, %r15
-; AVX512F-NEXT: movq 104(%rsp,%r10), %rbp
-; AVX512F-NEXT: shldq %cl, %rbp, %r13
-; AVX512F-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: movq 96(%rsp,%r10), %rdx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: shlxq %rcx, %rdx, %r10
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdx, %rbp
-; AVX512F-NEXT: movq %r12, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512F-NEXT: addq $-1, %r10
-; AVX512F-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: adcq $-1, %rbp
-; AVX512F-NEXT: adcq $-1, %r13
-; AVX512F-NEXT: adcq $-1, %r15
-; AVX512F-NEXT: adcq $-1, %r14
-; AVX512F-NEXT: adcq $-1, %rbx
-; AVX512F-NEXT: adcq $-1, %r11
-; AVX512F-NEXT: adcq $-1, %rax
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: movl $1, %r10d
+; AVX512F-NEXT: shlxq %rdi, %r10, %r11
+; AVX512F-NEXT: shrl $6, %edi
+; AVX512F-NEXT: shlxq %rdi, %r10, %rdi
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vpbroadcastq %r11, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm2, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edi
+; AVX512F-NEXT: vptestnmq %zmm2, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %r10d
+; AVX512F-NEXT: movzbl %r10b, %r10d
+; AVX512F-NEXT: leal (%r10,%rdi,2), %edi
+; AVX512F-NEXT: xorl %r10d, %edi
+; AVX512F-NEXT: kmovw %edi, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT: vmovdqu64 %zmm3, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r10
+; AVX512F-NEXT: vmovq %xmm2, %rbx
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovq %xmm0, %r11
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r9
+; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovq %xmm1, %r8
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rcx
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %r12d, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %r12d
-; AVX512F-NEXT: andl $56, %r12d
-; AVX512F-NEXT: movq -80(%rsp,%r12), %rsi
-; AVX512F-NEXT: movq -88(%rsp,%r12), %rdx
-; AVX512F-NEXT: movq %rdx, %r8
-; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -72(%rsp,%r12), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -64(%rsp,%r12), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -56(%rsp,%r12), %rdi
-; AVX512F-NEXT: shrdq %cl, %rdi, %r10
-; AVX512F-NEXT: andq %rbp, %r8
-; AVX512F-NEXT: andq %r13, %rsi
-; AVX512F-NEXT: andq %r15, %r9
-; AVX512F-NEXT: movq -48(%rsp,%r12), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rdi
-; AVX512F-NEXT: andq %r14, %r10
-; AVX512F-NEXT: andq %rbx, %rdi
-; AVX512F-NEXT: movq -96(%rsp,%r12), %rbx
-; AVX512F-NEXT: movq -40(%rsp,%r12), %r14
-; AVX512F-NEXT: shrdq %cl, %r14, %r15
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: andq %r11, %r15
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512F-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512F-NEXT: movq %rbx, (%rax)
-; AVX512F-NEXT: movq %r8, 8(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %r10, 32(%rax)
+; AVX512F-NEXT: movl %r14d, %edx
+; AVX512F-NEXT: andl $63, %edx
+; AVX512F-NEXT: vmovq %rdx, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: shrl $3, %r14d
+; AVX512F-NEXT: andl $56, %r14d
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%r14), %zmm2
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm2, %zmm2
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpsllq %xmm1, %zmm3, %zmm1
+; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT: andq %rdi, %rdx
+; AVX512F-NEXT: vmovq %xmm2, %rsi
+; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT: andq %rbx, %rsi
+; AVX512F-NEXT: andq %r10, %rdi
+; AVX512F-NEXT: vmovq %xmm2, %r10
+; AVX512F-NEXT: andq %r11, %r10
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r11
+; AVX512F-NEXT: andq %r9, %r11
+; AVX512F-NEXT: vmovq %xmm1, %r9
+; AVX512F-NEXT: andq %r8, %r9
+; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512F-NEXT: vmovq %xmm0, %rbx
+; AVX512F-NEXT: andq %rcx, %r8
+; AVX512F-NEXT: vmovq %xmm1, %rcx
+; AVX512F-NEXT: andq %rbx, %rcx
+; AVX512F-NEXT: movq %rdx, 56(%rax)
+; AVX512F-NEXT: movq %rsi, 48(%rax)
; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %r15, 48(%rax)
-; AVX512F-NEXT: movq %rcx, 56(%rax)
-; AVX512F-NEXT: addq $168, %rsp
+; AVX512F-NEXT: movq %r10, 32(%rax)
+; AVX512F-NEXT: movq %r11, 8(%rax)
+; AVX512F-NEXT: movq %r9, (%rax)
+; AVX512F-NEXT: movq %r8, 24(%rax)
+; AVX512F-NEXT: movq %rcx, 16(%rax)
+; AVX512F-NEXT: addq $8, %rsp
; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $152, %rsp
-; AVX512VL-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: movslq %eax, %r10
-; AVX512VL-NEXT: movq 128(%rsp,%r10), %r11
-; AVX512VL-NEXT: movq 136(%rsp,%r10), %rax
-; AVX512VL-NEXT: shldq %cl, %r11, %rax
-; AVX512VL-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT: movq 120(%rsp,%r10), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq 112(%rsp,%r10), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq 104(%rsp,%r10), %r15
-; AVX512VL-NEXT: movq 96(%rsp,%r10), %r12
-; AVX512VL-NEXT: movq 80(%rsp,%r10), %rax
-; AVX512VL-NEXT: movq 88(%rsp,%r10), %rbp
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: shldq %cl, %r15, %r14
-; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512VL-NEXT: shldq %cl, %r12, %r15
-; AVX512VL-NEXT: shldq %cl, %rbp, %r12
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: shlxq %rcx, %rax, %r13
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rax, %rbp
-; AVX512VL-NEXT: addq $-1, %r13
-; AVX512VL-NEXT: adcq $-1, %rbp
-; AVX512VL-NEXT: adcq $-1, %r12
-; AVX512VL-NEXT: adcq $-1, %r15
-; AVX512VL-NEXT: adcq $-1, %r14
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movl $1, %r11d
+; AVX512VL-NEXT: shlxq %r10, %r11, %r14
+; AVX512VL-NEXT: shrl $6, %r10d
+; AVX512VL-NEXT: shlxq %r10, %r11, %r10
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT: kmovd %r10d, %k1
+; AVX512VL-NEXT: vpbroadcastq %r14, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT: kmovd %k0, %r10d
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %r11d
+; AVX512VL-NEXT: leal (%r11,%r10,2), %r10d
+; AVX512VL-NEXT: xorl %r11d, %r10d
+; AVX512VL-NEXT: kmovd %r10d, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm2, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r11
+; AVX512VL-NEXT: vmovq %xmm1, %r10
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm2, %xmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: vmovdqu %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rbx
+; AVX512VL-NEXT: vmovdqu %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovq %xmm1, %r9
; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpextrq $1, %xmm2, %r8
+; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %r10d, %ecx
+; AVX512VL-NEXT: movl %edi, %ecx
; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %r10d
-; AVX512VL-NEXT: andl $56, %r10d
-; AVX512VL-NEXT: movq -96(%rsp,%r10), %rdx
-; AVX512VL-NEXT: movq -104(%rsp,%r10), %r8
-; AVX512VL-NEXT: movq %r8, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -88(%rsp,%r10), %rdi
-; AVX512VL-NEXT: shrdq %cl, %rdi, %rdx
-; AVX512VL-NEXT: andq %rbp, %rsi
-; AVX512VL-NEXT: movq -80(%rsp,%r10), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdi
-; AVX512VL-NEXT: andq %r12, %rdx
-; AVX512VL-NEXT: movq -72(%rsp,%r10), %r12
-; AVX512VL-NEXT: shrdq %cl, %r12, %r9
-; AVX512VL-NEXT: andq %r15, %rdi
-; AVX512VL-NEXT: movq -64(%rsp,%r10), %r15
-; AVX512VL-NEXT: shrdq %cl, %r15, %r12
-; AVX512VL-NEXT: andq %r14, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r10), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %r15
-; AVX512VL-NEXT: andq %rbx, %r12
-; AVX512VL-NEXT: movq -112(%rsp,%r10), %r10
-; AVX512VL-NEXT: shrdq %cl, %r8, %r10
-; AVX512VL-NEXT: andq %r11, %r15
-; AVX512VL-NEXT: andq %r13, %r10
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512VL-NEXT: movq %r10, (%rax)
-; AVX512VL-NEXT: movq %rsi, 8(%rax)
-; AVX512VL-NEXT: movq %rdx, 16(%rax)
-; AVX512VL-NEXT: movq %rdi, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r12, 40(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: shrl $3, %edi
+; AVX512VL-NEXT: andl $56, %edi
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm1
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vextracti128 $1, %ymm2, %xmm3
+; AVX512VL-NEXT: vpextrq $1, %xmm3, %rdx
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT: vmovq %xmm2, %rsi
+; AVX512VL-NEXT: andq %r11, %rcx
+; AVX512VL-NEXT: vmovq %xmm1, %rdi
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r11
+; AVX512VL-NEXT: andq %r10, %rdi
+; AVX512VL-NEXT: andq %rbx, %r11
+; AVX512VL-NEXT: vmovq %xmm1, %r10
+; AVX512VL-NEXT: andq %r9, %r10
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r9
+; AVX512VL-NEXT: andq %r8, %r9
+; AVX512VL-NEXT: vmovq %xmm0, %r8
+; AVX512VL-NEXT: andq %rsi, %r8
+; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT: vmovq %xmm3, %rbx
+; AVX512VL-NEXT: andq %rdx, %rsi
+; AVX512VL-NEXT: vmovq %xmm0, %rdx
+; AVX512VL-NEXT: andq %rbx, %rdx
; AVX512VL-NEXT: movq %rcx, 56(%rax)
-; AVX512VL-NEXT: addq $152, %rsp
+; AVX512VL-NEXT: movq %rdi, 48(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %r10, 32(%rax)
+; AVX512VL-NEXT: movq %r9, 8(%rax)
+; AVX512VL-NEXT: movq %r8, (%rax)
+; AVX512VL-NEXT: movq %rsi, 24(%rax)
+; AVX512VL-NEXT: movq %rdx, 16(%rax)
+; AVX512VL-NEXT: addq $8, %rsp
; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: subq $152, %rsp
-; AVX512VBMI-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: movslq %eax, %r10
-; AVX512VBMI-NEXT: movq 128(%rsp,%r10), %r11
-; AVX512VBMI-NEXT: movq 136(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rax
-; AVX512VBMI-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT: movq 120(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq 112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq 104(%rsp,%r10), %r15
-; AVX512VBMI-NEXT: movq 96(%rsp,%r10), %r12
-; AVX512VBMI-NEXT: movq 80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: movq 88(%rsp,%r10), %rbp
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: shldq %cl, %r15, %r14
-; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512VBMI-NEXT: shldq %cl, %r12, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rbp, %r12
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: shlxq %rcx, %rax, %r13
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rax, %rbp
-; AVX512VBMI-NEXT: addq $-1, %r13
-; AVX512VBMI-NEXT: adcq $-1, %rbp
-; AVX512VBMI-NEXT: adcq $-1, %r12
-; AVX512VBMI-NEXT: adcq $-1, %r15
-; AVX512VBMI-NEXT: adcq $-1, %r14
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl $1, %r11d
+; AVX512VBMI-NEXT: shlxq %r10, %r11, %r14
+; AVX512VBMI-NEXT: shrl $6, %r10d
+; AVX512VBMI-NEXT: shlxq %r10, %r11, %r10
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VBMI-NEXT: kmovd %r10d, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %r14, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %r10d
+; AVX512VBMI-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %r11d
+; AVX512VBMI-NEXT: leal (%r11,%r10,2), %r10d
+; AVX512VBMI-NEXT: xorl %r11d, %r10d
+; AVX512VBMI-NEXT: kmovd %r10d, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm2, %xmm1
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r11
+; AVX512VBMI-NEXT: vmovq %xmm1, %r10
+; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm2, %xmm3
+; AVX512VBMI-NEXT: movq %rbx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpextrq $1, %xmm3, %rbx
+; AVX512VBMI-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r9
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovq %xmm3, %r8
+; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %r10d, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %r10d
-; AVX512VBMI-NEXT: andl $56, %r10d
-; AVX512VBMI-NEXT: movq -96(%rsp,%r10), %rdx
-; AVX512VBMI-NEXT: movq -104(%rsp,%r10), %r8
-; AVX512VBMI-NEXT: movq %r8, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -88(%rsp,%r10), %rdi
-; AVX512VBMI-NEXT: shrdq %cl, %rdi, %rdx
-; AVX512VBMI-NEXT: andq %rbp, %rsi
-; AVX512VBMI-NEXT: movq -80(%rsp,%r10), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdi
-; AVX512VBMI-NEXT: andq %r12, %rdx
-; AVX512VBMI-NEXT: movq -72(%rsp,%r10), %r12
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %r9
-; AVX512VBMI-NEXT: andq %r15, %rdi
-; AVX512VBMI-NEXT: movq -64(%rsp,%r10), %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r12
-; AVX512VBMI-NEXT: andq %r14, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r10), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %r15
-; AVX512VBMI-NEXT: andq %rbx, %r12
-; AVX512VBMI-NEXT: movq -112(%rsp,%r10), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %r10
-; AVX512VBMI-NEXT: andq %r11, %r15
-; AVX512VBMI-NEXT: andq %r13, %r10
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512VBMI-NEXT: movq %r10, (%rax)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT: movq %r9, 32(%rax)
-; AVX512VBMI-NEXT: movq %r12, 40(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT: shrl $3, %edi
+; AVX512VBMI-NEXT: andl $56, %edi
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm2, %xmm0
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm1, %xmm3
+; AVX512VBMI-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX512VBMI-NEXT: vmovq %xmm2, %rsi
+; AVX512VBMI-NEXT: andq %r11, %rcx
+; AVX512VBMI-NEXT: vmovq %xmm3, %rdi
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm1, %xmm2
+; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r11
+; AVX512VBMI-NEXT: andq %r10, %rdi
+; AVX512VBMI-NEXT: andq %rbx, %r11
+; AVX512VBMI-NEXT: vmovq %xmm2, %r10
+; AVX512VBMI-NEXT: andq %r8, %r10
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512VBMI-NEXT: andq %r9, %r8
+; AVX512VBMI-NEXT: vmovq %xmm1, %r9
+; AVX512VBMI-NEXT: andq %rsi, %r9
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm1
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512VBMI-NEXT: vmovq %xmm0, %rbx
+; AVX512VBMI-NEXT: andq %rdx, %rsi
+; AVX512VBMI-NEXT: vmovq %xmm1, %rdx
+; AVX512VBMI-NEXT: andq %rbx, %rdx
; AVX512VBMI-NEXT: movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT: addq $152, %rsp
+; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
+; AVX512VBMI-NEXT: movq %r11, 40(%rax)
+; AVX512VBMI-NEXT: movq %r10, 32(%rax)
+; AVX512VBMI-NEXT: movq %r8, 8(%rax)
+; AVX512VBMI-NEXT: movq %r9, (%rax)
+; AVX512VBMI-NEXT: movq %rsi, 24(%rax)
+; AVX512VBMI-NEXT: movq %rdx, 16(%rax)
+; AVX512VBMI-NEXT: addq $8, %rsp
; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%bit = shl i512 1, %len
@@ -786,292 +730,117 @@ define i512 @bext_i512_vector(<8 x i64> %v0, i512 %idx, i512 %len) nounwind {
;
; AVX512F-LABEL: bext_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $152, %rsp
-; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %eax
-; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: cltq
-; AVX512F-NEXT: movq (%rsp,%rax), %r8
-; AVX512F-NEXT: movq 8(%rsp,%rax), %rdx
-; AVX512F-NEXT: shldq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -8(%rsp,%rax), %r9
-; AVX512F-NEXT: shldq %cl, %r9, %r8
-; AVX512F-NEXT: movq -16(%rsp,%rax), %r10
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: movq -24(%rsp,%rax), %rbx
-; AVX512F-NEXT: shldq %cl, %rbx, %r10
-; AVX512F-NEXT: movq -32(%rsp,%rax), %r14
-; AVX512F-NEXT: shldq %cl, %r14, %rbx
-; AVX512F-NEXT: movq -48(%rsp,%rax), %r11
-; AVX512F-NEXT: movq -40(%rsp,%rax), %r15
-; AVX512F-NEXT: shldq %cl, %r15, %r14
-; AVX512F-NEXT: shldq %cl, %r11, %r15
-; AVX512F-NEXT: shlxq %rcx, %r11, %rax
-; AVX512F-NEXT: addq $-1, %rax
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: adcq $-1, %r15
-; AVX512F-NEXT: adcq $-1, %r14
-; AVX512F-NEXT: adcq $-1, %rbx
-; AVX512F-NEXT: adcq $-1, %r10
-; AVX512F-NEXT: adcq $-1, %r9
-; AVX512F-NEXT: adcq $-1, %r8
-; AVX512F-NEXT: adcq $-1, %rdx
-; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq 32(%rsp,%rsi), %r12
-; AVX512F-NEXT: movq 24(%rsp,%rsi), %r13
-; AVX512F-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: shrdq %cl, %r12, %r13
-; AVX512F-NEXT: movq 40(%rsp,%rsi), %rbp
-; AVX512F-NEXT: shrdq %cl, %rbp, %r12
-; AVX512F-NEXT: movq 48(%rsp,%rsi), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %rbp
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 56(%rsp,%rsi), %rdi
-; AVX512F-NEXT: shrdq %cl, %rdi, %r11
-; AVX512F-NEXT: andq %r15, %r13
-; AVX512F-NEXT: andq %r14, %r12
-; AVX512F-NEXT: andq %rbx, %rbp
-; AVX512F-NEXT: movq 64(%rsp,%rsi), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %rdi
-; AVX512F-NEXT: andq %r10, %r11
-; AVX512F-NEXT: andq %r9, %rdi
-; AVX512F-NEXT: movq 16(%rsp,%rsi), %r9
-; AVX512F-NEXT: movq 72(%rsp,%rsi), %rsi
-; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: andq %r8, %rbx
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
-; AVX512F-NEXT: andq %rdx, %rcx
-; AVX512F-NEXT: movq %r9, (%rax)
-; AVX512F-NEXT: movq %r13, 8(%rax)
-; AVX512F-NEXT: movq %r12, 16(%rax)
-; AVX512F-NEXT: movq %rbp, 24(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %rbx, 48(%rax)
-; AVX512F-NEXT: movq %rcx, 56(%rax)
-; AVX512F-NEXT: addq $152, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: movl $1, %edx
+; AVX512F-NEXT: shlxq %rcx, %rdx, %rdi
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm3, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 & (zmm0 | zmm4)
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $136, %rsp
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: cltq
-; AVX512VL-NEXT: movq -16(%rsp,%rax), %r8
-; AVX512VL-NEXT: movq -8(%rsp,%rax), %rdx
-; AVX512VL-NEXT: shldq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT: shldq %cl, %r9, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT: shldq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -40(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %r10
-; AVX512VL-NEXT: movq -48(%rsp,%rax), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -64(%rsp,%rax), %r11
-; AVX512VL-NEXT: movq -56(%rsp,%rax), %r12
-; AVX512VL-NEXT: shldq %cl, %r12, %r14
-; AVX512VL-NEXT: shldq %cl, %r11, %r12
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r11
-; AVX512VL-NEXT: addq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, %r12
-; AVX512VL-NEXT: adcq $-1, %r14
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %r10
-; AVX512VL-NEXT: adcq $-1, %r9
-; AVX512VL-NEXT: adcq $-1, %r8
-; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: adcq $-1, %rdx
-; AVX512VL-NEXT: vmovups %ymm0, (%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq 16(%rsp,%rsi), %r15
-; AVX512VL-NEXT: movq 8(%rsp,%rsi), %rbp
-; AVX512VL-NEXT: movq %rbp, %r13
-; AVX512VL-NEXT: shrdq %cl, %r15, %r13
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq 24(%rsp,%rsi), %rdi
-; AVX512VL-NEXT: shrdq %cl, %rdi, %r15
-; AVX512VL-NEXT: andq %r12, %r13
-; AVX512VL-NEXT: movq 32(%rsp,%rsi), %r12
-; AVX512VL-NEXT: shrdq %cl, %r12, %rdi
-; AVX512VL-NEXT: andq %r14, %r15
-; AVX512VL-NEXT: movq 40(%rsp,%rsi), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %r12
-; AVX512VL-NEXT: andq %rbx, %rdi
-; AVX512VL-NEXT: movq 48(%rsp,%rsi), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r14
-; AVX512VL-NEXT: andq %r10, %r12
-; AVX512VL-NEXT: movq 56(%rsp,%rsi), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT: andq %r9, %r14
-; AVX512VL-NEXT: movq (%rsp,%rsi), %rsi
-; AVX512VL-NEXT: shrdq %cl, %rbp, %rsi
-; AVX512VL-NEXT: andq %r8, %rbx
-; AVX512VL-NEXT: andq %r11, %rsi
-; AVX512VL-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512VL-NEXT: andq %rdx, %rcx
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r13, 8(%rax)
-; AVX512VL-NEXT: movq %r15, 16(%rax)
-; AVX512VL-NEXT: movq %rdi, 24(%rax)
-; AVX512VL-NEXT: movq %r12, 32(%rax)
-; AVX512VL-NEXT: movq %r14, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 48(%rax)
-; AVX512VL-NEXT: movq %rcx, 56(%rax)
-; AVX512VL-NEXT: addq $136, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: movl $1, %edx
+; AVX512VL-NEXT: shlxq %rcx, %rdx, %rdi
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512VL-NEXT: vpandn %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 & (zmm0 | zmm4)
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: subq $136, %rsp
-; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: cltq
-; AVX512VBMI-NEXT: movq -16(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: movq -8(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -40(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r10
-; AVX512VBMI-NEXT: movq -48(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -64(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: movq -56(%rsp,%rax), %r12
-; AVX512VBMI-NEXT: shldq %cl, %r12, %r14
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r12
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r11
-; AVX512VBMI-NEXT: addq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, %r12
-; AVX512VBMI-NEXT: adcq $-1, %r14
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %r10
-; AVX512VBMI-NEXT: adcq $-1, %r9
-; AVX512VBMI-NEXT: adcq $-1, %r8
-; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: adcq $-1, %rdx
-; AVX512VBMI-NEXT: vmovups %ymm0, (%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq 16(%rsp,%rsi), %r15
-; AVX512VBMI-NEXT: movq 8(%rsp,%rsi), %rbp
-; AVX512VBMI-NEXT: movq %rbp, %r13
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r13
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq 24(%rsp,%rsi), %rdi
-; AVX512VBMI-NEXT: shrdq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: andq %r12, %r13
-; AVX512VBMI-NEXT: movq 32(%rsp,%rsi), %r12
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rdi
-; AVX512VBMI-NEXT: andq %r14, %r15
-; AVX512VBMI-NEXT: movq 40(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %r12
-; AVX512VBMI-NEXT: andq %rbx, %rdi
-; AVX512VBMI-NEXT: movq 48(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: andq %r10, %r12
-; AVX512VBMI-NEXT: movq 56(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: andq %r9, %r14
-; AVX512VBMI-NEXT: movq (%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rbp, %rsi
-; AVX512VBMI-NEXT: andq %r8, %rbx
-; AVX512VBMI-NEXT: andq %r11, %rsi
-; AVX512VBMI-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512VBMI-NEXT: andq %rdx, %rcx
-; AVX512VBMI-NEXT: movq %rsi, (%rax)
-; AVX512VBMI-NEXT: movq %r13, 8(%rax)
-; AVX512VBMI-NEXT: movq %r15, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT: movq %r12, 32(%rax)
-; AVX512VBMI-NEXT: movq %r14, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rax)
-; AVX512VBMI-NEXT: movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT: addq $136, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT: movl $1, %edx
+; AVX512VBMI-NEXT: shlxq %rcx, %rdx, %rdi
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxq %rcx, %rdx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %ecx
+; AVX512VBMI-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %edx
+; AVX512VBMI-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT: xorl %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm1
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm3, %zmm0
+; AVX512VBMI-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
@@ -1302,297 +1071,120 @@ define i512 @bext_i512_load(ptr %p0, i512 %idx, i512 %len) nounwind {
;
; AVX512F-LABEL: bext_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $152, %rsp
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movl %eax, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %eax
-; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: cltq
-; AVX512F-NEXT: movq (%rsp,%rax), %r8
-; AVX512F-NEXT: movq 8(%rsp,%rax), %rsi
-; AVX512F-NEXT: shldq %cl, %r8, %rsi
-; AVX512F-NEXT: movq -8(%rsp,%rax), %r9
-; AVX512F-NEXT: shldq %cl, %r9, %r8
-; AVX512F-NEXT: movq -16(%rsp,%rax), %r10
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: movq -24(%rsp,%rax), %rbx
-; AVX512F-NEXT: shldq %cl, %rbx, %r10
-; AVX512F-NEXT: movq -32(%rsp,%rax), %r14
-; AVX512F-NEXT: shldq %cl, %r14, %rbx
-; AVX512F-NEXT: movq -48(%rsp,%rax), %r11
-; AVX512F-NEXT: movq -40(%rsp,%rax), %r15
-; AVX512F-NEXT: shldq %cl, %r15, %r14
-; AVX512F-NEXT: shldq %cl, %r11, %r15
-; AVX512F-NEXT: shlxq %rcx, %r11, %rax
-; AVX512F-NEXT: addq $-1, %rax
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: adcq $-1, %r15
-; AVX512F-NEXT: adcq $-1, %r14
-; AVX512F-NEXT: adcq $-1, %rbx
-; AVX512F-NEXT: adcq $-1, %r10
-; AVX512F-NEXT: adcq $-1, %r9
-; AVX512F-NEXT: adcq $-1, %r8
-; AVX512F-NEXT: adcq $-1, %rsi
-; AVX512F-NEXT: vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq 32(%rsp,%rdx), %r12
-; AVX512F-NEXT: movq 24(%rsp,%rdx), %r13
-; AVX512F-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: shrdq %cl, %r12, %r13
-; AVX512F-NEXT: movq 40(%rsp,%rdx), %rbp
-; AVX512F-NEXT: shrdq %cl, %rbp, %r12
-; AVX512F-NEXT: movq 48(%rsp,%rdx), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %rbp
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 56(%rsp,%rdx), %rdi
-; AVX512F-NEXT: shrdq %cl, %rdi, %r11
-; AVX512F-NEXT: andq %r15, %r13
-; AVX512F-NEXT: andq %r14, %r12
-; AVX512F-NEXT: andq %rbx, %rbp
-; AVX512F-NEXT: movq 64(%rsp,%rdx), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %rdi
-; AVX512F-NEXT: andq %r10, %r11
-; AVX512F-NEXT: andq %r9, %rdi
-; AVX512F-NEXT: movq 16(%rsp,%rdx), %r9
-; AVX512F-NEXT: movq 72(%rsp,%rdx), %rdx
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: andq %r8, %rbx
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
-; AVX512F-NEXT: andq %rsi, %rcx
-; AVX512F-NEXT: movq %r9, (%rax)
-; AVX512F-NEXT: movq %r13, 8(%rax)
-; AVX512F-NEXT: movq %r12, 16(%rax)
-; AVX512F-NEXT: movq %rbp, 24(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %rdi, 40(%rax)
-; AVX512F-NEXT: movq %rbx, 48(%rax)
-; AVX512F-NEXT: movq %rcx, 56(%rax)
-; AVX512F-NEXT: addq $152, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: movl $1, %esi
+; AVX512F-NEXT: shlxq %rcx, %rsi, %rdi
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxq %rcx, %rsi, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512F-NEXT: movl %edx, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm3, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm3 & (zmm0 | zmm4)
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bext_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: subq $136, %rsp
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm3 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: cltq
-; AVX512VL-NEXT: movq -16(%rsp,%rax), %r8
-; AVX512VL-NEXT: movq -8(%rsp,%rax), %rsi
-; AVX512VL-NEXT: shldq %cl, %r8, %rsi
-; AVX512VL-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT: shldq %cl, %r9, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT: shldq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -40(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %r10
-; AVX512VL-NEXT: movq -48(%rsp,%rax), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -64(%rsp,%rax), %r11
-; AVX512VL-NEXT: movq -56(%rsp,%rax), %r12
-; AVX512VL-NEXT: shldq %cl, %r12, %r14
-; AVX512VL-NEXT: shldq %cl, %r11, %r12
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r11
-; AVX512VL-NEXT: addq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, %r12
-; AVX512VL-NEXT: adcq $-1, %r14
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %r10
-; AVX512VL-NEXT: adcq $-1, %r9
-; AVX512VL-NEXT: adcq $-1, %r8
-; AVX512VL-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: adcq $-1, %rsi
-; AVX512VL-NEXT: vmovups %ymm0, (%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq 16(%rsp,%rdx), %r15
-; AVX512VL-NEXT: movq 8(%rsp,%rdx), %rbp
-; AVX512VL-NEXT: movq %rbp, %r13
-; AVX512VL-NEXT: shrdq %cl, %r15, %r13
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq 24(%rsp,%rdx), %rdi
-; AVX512VL-NEXT: shrdq %cl, %rdi, %r15
-; AVX512VL-NEXT: andq %r12, %r13
-; AVX512VL-NEXT: movq 32(%rsp,%rdx), %r12
-; AVX512VL-NEXT: shrdq %cl, %r12, %rdi
-; AVX512VL-NEXT: andq %r14, %r15
-; AVX512VL-NEXT: movq 40(%rsp,%rdx), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %r12
-; AVX512VL-NEXT: andq %rbx, %rdi
-; AVX512VL-NEXT: movq 48(%rsp,%rdx), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r14
-; AVX512VL-NEXT: andq %r10, %r12
-; AVX512VL-NEXT: movq 56(%rsp,%rdx), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT: andq %r9, %r14
-; AVX512VL-NEXT: movq (%rsp,%rdx), %rdx
-; AVX512VL-NEXT: shrdq %cl, %rbp, %rdx
-; AVX512VL-NEXT: andq %r8, %rbx
-; AVX512VL-NEXT: andq %r11, %rdx
-; AVX512VL-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512VL-NEXT: andq %rsi, %rcx
-; AVX512VL-NEXT: movq %rdx, (%rax)
-; AVX512VL-NEXT: movq %r13, 8(%rax)
-; AVX512VL-NEXT: movq %r15, 16(%rax)
-; AVX512VL-NEXT: movq %rdi, 24(%rax)
-; AVX512VL-NEXT: movq %r12, 32(%rax)
-; AVX512VL-NEXT: movq %r14, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 48(%rax)
-; AVX512VL-NEXT: movq %rcx, 56(%rax)
-; AVX512VL-NEXT: addq $136, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movl $1, %esi
+; AVX512VL-NEXT: shlxq %rcx, %rsi, %rdi
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxq %rcx, %rsi, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %esi
+; AVX512VL-NEXT: leal (%rsi,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %esi, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512VL-NEXT: vpandn %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpternlogq {{.*#+}} zmm0 = zmm2 & (zmm0 | zmm4)
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bext_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: subq $136, %rsp
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm3 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: cltq
-; AVX512VBMI-NEXT: movq -16(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: movq -8(%rsp,%rax), %rsi
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rsi
-; AVX512VBMI-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -40(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r10
-; AVX512VBMI-NEXT: movq -48(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -64(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: movq -56(%rsp,%rax), %r12
-; AVX512VBMI-NEXT: shldq %cl, %r12, %r14
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r12
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r11
-; AVX512VBMI-NEXT: addq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, %r12
-; AVX512VBMI-NEXT: adcq $-1, %r14
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %r10
-; AVX512VBMI-NEXT: adcq $-1, %r9
-; AVX512VBMI-NEXT: adcq $-1, %r8
-; AVX512VBMI-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: adcq $-1, %rsi
-; AVX512VBMI-NEXT: vmovups %ymm0, (%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq 16(%rsp,%rdx), %r15
-; AVX512VBMI-NEXT: movq 8(%rsp,%rdx), %rbp
-; AVX512VBMI-NEXT: movq %rbp, %r13
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r13
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq 24(%rsp,%rdx), %rdi
-; AVX512VBMI-NEXT: shrdq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: andq %r12, %r13
-; AVX512VBMI-NEXT: movq 32(%rsp,%rdx), %r12
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rdi
-; AVX512VBMI-NEXT: andq %r14, %r15
-; AVX512VBMI-NEXT: movq 40(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %r12
-; AVX512VBMI-NEXT: andq %rbx, %rdi
-; AVX512VBMI-NEXT: movq 48(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: andq %r10, %r12
-; AVX512VBMI-NEXT: movq 56(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: andq %r9, %r14
-; AVX512VBMI-NEXT: movq (%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rbp, %rdx
-; AVX512VBMI-NEXT: andq %r8, %rbx
-; AVX512VBMI-NEXT: andq %r11, %rdx
-; AVX512VBMI-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512VBMI-NEXT: andq %rsi, %rcx
-; AVX512VBMI-NEXT: movq %rdx, (%rax)
-; AVX512VBMI-NEXT: movq %r13, 8(%rax)
-; AVX512VBMI-NEXT: movq %r15, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT: movq %r12, 32(%rax)
-; AVX512VBMI-NEXT: movq %r14, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rax)
-; AVX512VBMI-NEXT: movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT: addq $136, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: movl $1, %esi
+; AVX512VBMI-NEXT: shlxq %rcx, %rsi, %rdi
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxq %rcx, %rsi, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %ecx
+; AVX512VBMI-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %esi
+; AVX512VBMI-NEXT: leal (%rsi,%rcx,2), %ecx
+; AVX512VBMI-NEXT: xorl %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm3, %zmm0
+; AVX512VBMI-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3924,220 +3516,181 @@ define i512 @bzhi_i512(i512 %a0, i512 %idx) nounwind {
;
; AVX512F-LABEL: bzhi_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
; AVX512F-NEXT: pushq %r15
; AVX512F-NEXT: pushq %r14
; AVX512F-NEXT: pushq %r13
; AVX512F-NEXT: pushq %r12
; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, %rax
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movl %r10d, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %r10d
-; AVX512F-NEXT: andl $56, %r10d
-; AVX512F-NEXT: negl %r10d
-; AVX512F-NEXT: movslq %r10d, %r13
-; AVX512F-NEXT: movq -16(%rsp,%r13), %r11
-; AVX512F-NEXT: movq -8(%rsp,%r13), %r10
-; AVX512F-NEXT: shldq %cl, %r11, %r10
-; AVX512F-NEXT: movq -24(%rsp,%r13), %rbx
-; AVX512F-NEXT: shldq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -32(%rsp,%r13), %r14
-; AVX512F-NEXT: shldq %cl, %r14, %rbx
-; AVX512F-NEXT: movq -40(%rsp,%r13), %r15
-; AVX512F-NEXT: shldq %cl, %r15, %r14
-; AVX512F-NEXT: movq -48(%rsp,%r13), %r12
-; AVX512F-NEXT: shldq %cl, %r12, %r15
-; AVX512F-NEXT: movq -64(%rsp,%r13), %rbp
-; AVX512F-NEXT: movq -56(%rsp,%r13), %r13
-; AVX512F-NEXT: shldq %cl, %r13, %r12
-; AVX512F-NEXT: shldq %cl, %rbp, %r13
-; AVX512F-NEXT: shlxq %rcx, %rbp, %rcx
-; AVX512F-NEXT: addq $-1, %rcx
-; AVX512F-NEXT: adcq $-1, %r13
-; AVX512F-NEXT: adcq $-1, %r12
-; AVX512F-NEXT: adcq $-1, %r15
-; AVX512F-NEXT: adcq $-1, %r14
-; AVX512F-NEXT: adcq $-1, %rbx
-; AVX512F-NEXT: adcq $-1, %r11
-; AVX512F-NEXT: adcq $-1, %r10
-; AVX512F-NEXT: andq %r9, %r14
-; AVX512F-NEXT: andq %r8, %r15
-; AVX512F-NEXT: andq %rax, %r12
-; AVX512F-NEXT: andq %rdx, %r13
-; AVX512F-NEXT: andq %rsi, %rcx
-; AVX512F-NEXT: movq %rcx, (%rdi)
-; AVX512F-NEXT: movq %r13, 8(%rdi)
-; AVX512F-NEXT: movq %r12, 16(%rdi)
-; AVX512F-NEXT: movq %r15, 24(%rdi)
-; AVX512F-NEXT: andq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: movq %r14, 32(%rdi)
-; AVX512F-NEXT: movq %rbx, 40(%rdi)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movl $1, %r10d
+; AVX512F-NEXT: shlxq %rax, %r10, %r11
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: shlxq %rax, %r10, %rax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpbroadcastq %r11, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT: vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %eax
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %r10d
+; AVX512F-NEXT: movzbl %r10b, %r10d
+; AVX512F-NEXT: leal (%r10,%rax,2), %eax
+; AVX512F-NEXT: xorl %r10d, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT: vpextrq $1, %xmm1, %r10
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
+; AVX512F-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT: vmovq %xmm1, %r11
+; AVX512F-NEXT: vmovq %xmm2, %rbx
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r14
+; AVX512F-NEXT: vmovq %xmm0, %r15
+; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT: vpextrq $1, %xmm0, %r12
+; AVX512F-NEXT: vmovq %xmm0, %r13
+; AVX512F-NEXT: andq %rcx, %r13
+; AVX512F-NEXT: andq %r8, %r12
+; AVX512F-NEXT: andq %rsi, %r15
+; AVX512F-NEXT: andq %rdx, %r14
+; AVX512F-NEXT: andq %r9, %rbx
+; AVX512F-NEXT: andq {{[0-9]+}}(%rsp), %rdi
; AVX512F-NEXT: andq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq %r11, 48(%rdi)
; AVX512F-NEXT: andq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq %r10, 56(%rdi)
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: addq $8, %rsp
+; AVX512F-NEXT: movq %r10, 56(%rax)
+; AVX512F-NEXT: movq %r11, 48(%rax)
+; AVX512F-NEXT: movq %rdi, 40(%rax)
+; AVX512F-NEXT: movq %rbx, 32(%rax)
+; AVX512F-NEXT: movq %r14, 8(%rax)
+; AVX512F-NEXT: movq %r15, (%rax)
+; AVX512F-NEXT: movq %r12, 24(%rax)
+; AVX512F-NEXT: movq %r13, 16(%rax)
; AVX512F-NEXT: popq %rbx
; AVX512F-NEXT: popq %r12
; AVX512F-NEXT: popq %r13
; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
; AVX512VL-NEXT: pushq %r15
; AVX512VL-NEXT: pushq %r14
; AVX512VL-NEXT: pushq %r13
; AVX512VL-NEXT: pushq %r12
; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: movq %rcx, %rax
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movl %r10d, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %r10d
-; AVX512VL-NEXT: andl $56, %r10d
-; AVX512VL-NEXT: negl %r10d
-; AVX512VL-NEXT: movslq %r10d, %rbp
-; AVX512VL-NEXT: movq -16(%rsp,%rbp), %r11
-; AVX512VL-NEXT: movq -8(%rsp,%rbp), %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -24(%rsp,%rbp), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -32(%rsp,%rbp), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -40(%rsp,%rbp), %r15
-; AVX512VL-NEXT: shldq %cl, %r15, %r14
-; AVX512VL-NEXT: movq -48(%rsp,%rbp), %r12
-; AVX512VL-NEXT: shldq %cl, %r12, %r15
-; AVX512VL-NEXT: movq -56(%rsp,%rbp), %r13
-; AVX512VL-NEXT: shldq %cl, %r13, %r12
-; AVX512VL-NEXT: movq -64(%rsp,%rbp), %rbp
-; AVX512VL-NEXT: shldq %cl, %rbp, %r13
-; AVX512VL-NEXT: shlxq %rcx, %rbp, %rcx
-; AVX512VL-NEXT: addq $-1, %rcx
-; AVX512VL-NEXT: adcq $-1, %r13
-; AVX512VL-NEXT: adcq $-1, %r12
-; AVX512VL-NEXT: adcq $-1, %r15
-; AVX512VL-NEXT: adcq $-1, %r14
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, %r10
-; AVX512VL-NEXT: andq %r9, %r14
-; AVX512VL-NEXT: andq %r8, %r15
-; AVX512VL-NEXT: andq %rax, %r12
-; AVX512VL-NEXT: andq %rdx, %r13
-; AVX512VL-NEXT: andq %rsi, %rcx
-; AVX512VL-NEXT: movq %rcx, (%rdi)
-; AVX512VL-NEXT: movq %r13, 8(%rdi)
-; AVX512VL-NEXT: movq %r12, 16(%rdi)
-; AVX512VL-NEXT: movq %r15, 24(%rdi)
-; AVX512VL-NEXT: movq %r14, 32(%rdi)
-; AVX512VL-NEXT: andq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: movq %rbx, 40(%rdi)
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: movl $1, %r10d
+; AVX512VL-NEXT: shlxq %rdi, %r10, %r11
+; AVX512VL-NEXT: shrl $6, %edi
+; AVX512VL-NEXT: shlxq %rdi, %r10, %rdi
+; AVX512VL-NEXT: kmovd %edi, %k1
+; AVX512VL-NEXT: vpbroadcastq %r11, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %edi
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %r10d
+; AVX512VL-NEXT: leal (%r10,%rdi,2), %edi
+; AVX512VL-NEXT: xorl %r10d, %edi
+; AVX512VL-NEXT: kmovd %edi, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm0
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX512VL-NEXT: vmovq %xmm0, %r10
+; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm0
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r11
+; AVX512VL-NEXT: vmovq %xmm0, %rbx
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %r14
+; AVX512VL-NEXT: vmovq %xmm1, %r15
+; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %r12
+; AVX512VL-NEXT: vmovq %xmm0, %r13
+; AVX512VL-NEXT: andq %rcx, %r13
+; AVX512VL-NEXT: andq %r8, %r12
+; AVX512VL-NEXT: andq %rsi, %r15
+; AVX512VL-NEXT: andq %rdx, %r14
+; AVX512VL-NEXT: andq %r9, %rbx
; AVX512VL-NEXT: andq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: movq %r11, 48(%rdi)
; AVX512VL-NEXT: andq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq %r10, 56(%rdi)
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: addq $8, %rsp
+; AVX512VL-NEXT: andq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT: movq %rdi, 56(%rax)
+; AVX512VL-NEXT: movq %r10, 48(%rax)
+; AVX512VL-NEXT: movq %r11, 40(%rax)
+; AVX512VL-NEXT: movq %rbx, 32(%rax)
+; AVX512VL-NEXT: movq %r14, 8(%rax)
+; AVX512VL-NEXT: movq %r15, (%rax)
+; AVX512VL-NEXT: movq %r12, 24(%rax)
+; AVX512VL-NEXT: movq %r13, 16(%rax)
; AVX512VL-NEXT: popq %rbx
; AVX512VL-NEXT: popq %r12
; AVX512VL-NEXT: popq %r13
; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bzhi_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
; AVX512VBMI-NEXT: pushq %r15
; AVX512VBMI-NEXT: pushq %r14
; AVX512VBMI-NEXT: pushq %r13
; AVX512VBMI-NEXT: pushq %r12
; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: movq %rcx, %rax
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movl %r10d, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %r10d
-; AVX512VBMI-NEXT: andl $56, %r10d
-; AVX512VBMI-NEXT: negl %r10d
-; AVX512VBMI-NEXT: movslq %r10d, %rbp
-; AVX512VBMI-NEXT: movq -16(%rsp,%rbp), %r11
-; AVX512VBMI-NEXT: movq -8(%rsp,%rbp), %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -24(%rsp,%rbp), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -32(%rsp,%rbp), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -40(%rsp,%rbp), %r15
-; AVX512VBMI-NEXT: shldq %cl, %r15, %r14
-; AVX512VBMI-NEXT: movq -48(%rsp,%rbp), %r12
-; AVX512VBMI-NEXT: shldq %cl, %r12, %r15
-; AVX512VBMI-NEXT: movq -56(%rsp,%rbp), %r13
-; AVX512VBMI-NEXT: shldq %cl, %r13, %r12
-; AVX512VBMI-NEXT: movq -64(%rsp,%rbp), %rbp
-; AVX512VBMI-NEXT: shldq %cl, %rbp, %r13
-; AVX512VBMI-NEXT: shlxq %rcx, %rbp, %rcx
-; AVX512VBMI-NEXT: addq $-1, %rcx
-; AVX512VBMI-NEXT: adcq $-1, %r13
-; AVX512VBMI-NEXT: adcq $-1, %r12
-; AVX512VBMI-NEXT: adcq $-1, %r15
-; AVX512VBMI-NEXT: adcq $-1, %r14
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, %r10
-; AVX512VBMI-NEXT: andq %r9, %r14
-; AVX512VBMI-NEXT: andq %r8, %r15
-; AVX512VBMI-NEXT: andq %rax, %r12
-; AVX512VBMI-NEXT: andq %rdx, %r13
-; AVX512VBMI-NEXT: andq %rsi, %rcx
-; AVX512VBMI-NEXT: movq %rcx, (%rdi)
-; AVX512VBMI-NEXT: movq %r13, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r12, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r15, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r14, 32(%rdi)
-; AVX512VBMI-NEXT: andq {{[0-9]+}}(%rsp), %rbx
-; AVX512VBMI-NEXT: movq %rbx, 40(%rdi)
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: movl $1, %r10d
+; AVX512VBMI-NEXT: shlxq %rdi, %r10, %r11
+; AVX512VBMI-NEXT: shrl $6, %edi
+; AVX512VBMI-NEXT: shlxq %rdi, %r10, %rdi
+; AVX512VBMI-NEXT: kmovd %edi, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %r11, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %edi
+; AVX512VBMI-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %r10d
+; AVX512VBMI-NEXT: leal (%r10,%rdi,2), %edi
+; AVX512VBMI-NEXT: xorl %r10d, %edi
+; AVX512VBMI-NEXT: kmovd %edi, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm1, %xmm0
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdi
+; AVX512VBMI-NEXT: vmovq %xmm0, %r10
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm1, %xmm0
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r11
+; AVX512VBMI-NEXT: vmovq %xmm0, %rbx
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r14
+; AVX512VBMI-NEXT: vmovq %xmm1, %r15
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm0
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r12
+; AVX512VBMI-NEXT: vmovq %xmm0, %r13
+; AVX512VBMI-NEXT: andq %rcx, %r13
+; AVX512VBMI-NEXT: andq %r8, %r12
+; AVX512VBMI-NEXT: andq %rsi, %r15
+; AVX512VBMI-NEXT: andq %rdx, %r14
+; AVX512VBMI-NEXT: andq %r9, %rbx
; AVX512VBMI-NEXT: andq {{[0-9]+}}(%rsp), %r11
-; AVX512VBMI-NEXT: movq %r11, 48(%rdi)
; AVX512VBMI-NEXT: andq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: addq $8, %rsp
+; AVX512VBMI-NEXT: andq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT: movq %r10, 48(%rax)
+; AVX512VBMI-NEXT: movq %r11, 40(%rax)
+; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT: movq %r14, 8(%rax)
+; AVX512VBMI-NEXT: movq %r15, (%rax)
+; AVX512VBMI-NEXT: movq %r12, 24(%rax)
+; AVX512VBMI-NEXT: movq %r13, 16(%rax)
; AVX512VBMI-NEXT: popq %rbx
; AVX512VBMI-NEXT: popq %r12
; AVX512VBMI-NEXT: popq %r13
; AVX512VBMI-NEXT: popq %r14
; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%bit = shl i512 1, %idx
@@ -4379,223 +3932,73 @@ define i512 @bzhi_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
;
; AVX512F-LABEL: bzhi_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: negl %esi
-; AVX512F-NEXT: movslq %esi, %r14
-; AVX512F-NEXT: movq -16(%rsp,%r14), %rdx
-; AVX512F-NEXT: movq -8(%rsp,%r14), %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -24(%rsp,%r14), %r8
-; AVX512F-NEXT: shldq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -32(%rsp,%r14), %r9
-; AVX512F-NEXT: shldq %cl, %r9, %r8
-; AVX512F-NEXT: movq -40(%rsp,%r14), %r10
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512F-NEXT: movq -48(%rsp,%r14), %r11
-; AVX512F-NEXT: shldq %cl, %r11, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT: movq -64(%rsp,%r14), %r15
-; AVX512F-NEXT: movq -56(%rsp,%r14), %rdi
-; AVX512F-NEXT: shldq %cl, %rdi, %r11
-; AVX512F-NEXT: shldq %cl, %r15, %rdi
-; AVX512F-NEXT: shlxq %rcx, %r15, %rcx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT: addq $-1, %rcx
-; AVX512F-NEXT: adcq $-1, %rdi
-; AVX512F-NEXT: adcq $-1, %r11
-; AVX512F-NEXT: adcq $-1, %r10
-; AVX512F-NEXT: adcq $-1, %r9
-; AVX512F-NEXT: adcq $-1, %r8
-; AVX512F-NEXT: adcq $-1, %rdx
-; AVX512F-NEXT: adcq $-1, %rsi
-; AVX512F-NEXT: andq %r14, %rsi
-; AVX512F-NEXT: vmovq %xmm1, %r14
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: andq %r14, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT: andq %r14, %r8
-; AVX512F-NEXT: vmovq %xmm1, %r14
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT: andq %r14, %r9
-; AVX512F-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT: andq %r14, %r10
-; AVX512F-NEXT: vmovq %xmm1, %r14
-; AVX512F-NEXT: andq %r14, %r11
-; AVX512F-NEXT: andq %rbx, %rdi
-; AVX512F-NEXT: vmovq %xmm0, %rbx
-; AVX512F-NEXT: andq %rbx, %rcx
-; AVX512F-NEXT: movq %rcx, (%rax)
-; AVX512F-NEXT: movq %rdi, 8(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %r10, 24(%rax)
-; AVX512F-NEXT: movq %r9, 32(%rax)
-; AVX512F-NEXT: movq %r8, 40(%rax)
-; AVX512F-NEXT: movq %rdx, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 56(%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: movl $1, %ecx
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm1 = [1,0]
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: negl %esi
-; AVX512VL-NEXT: movslq %esi, %r14
-; AVX512VL-NEXT: movq -16(%rsp,%r14), %rsi
-; AVX512VL-NEXT: movq -8(%rsp,%r14), %r8
-; AVX512VL-NEXT: shldq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -24(%rsp,%r14), %r9
-; AVX512VL-NEXT: shldq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -32(%rsp,%r14), %r10
-; AVX512VL-NEXT: shldq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -40(%rsp,%r14), %r11
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -48(%rsp,%r14), %rdi
-; AVX512VL-NEXT: shldq %cl, %rdi, %r11
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT: movq -56(%rsp,%r14), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %rdi
-; AVX512VL-NEXT: movq -64(%rsp,%r14), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: shlxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT: addq $-1, %rcx
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %rdi
-; AVX512VL-NEXT: adcq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, %r10
-; AVX512VL-NEXT: adcq $-1, %r9
-; AVX512VL-NEXT: adcq $-1, %rsi
-; AVX512VL-NEXT: adcq $-1, %r8
-; AVX512VL-NEXT: andq %r14, %r8
-; AVX512VL-NEXT: vmovq %xmm1, %r14
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT: andq %r14, %rsi
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT: andq %r14, %r9
-; AVX512VL-NEXT: vmovq %xmm1, %r14
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT: andq %r14, %r10
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT: andq %r14, %r11
-; AVX512VL-NEXT: vmovq %xmm1, %r14
-; AVX512VL-NEXT: andq %r14, %rdi
-; AVX512VL-NEXT: andq %rdx, %rbx
-; AVX512VL-NEXT: vmovq %xmm0, %rdx
-; AVX512VL-NEXT: andq %rdx, %rcx
-; AVX512VL-NEXT: movq %rcx, (%rax)
-; AVX512VL-NEXT: movq %rbx, 8(%rax)
-; AVX512VL-NEXT: movq %rdi, 16(%rax)
-; AVX512VL-NEXT: movq %r11, 24(%rax)
-; AVX512VL-NEXT: movq %r10, 32(%rax)
-; AVX512VL-NEXT: movq %r9, 40(%rax)
-; AVX512VL-NEXT: movq %rsi, 48(%rax)
-; AVX512VL-NEXT: movq %r8, 56(%rax)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: vzeroupper
-; AVX512VL-NEXT: retq
-;
-; AVX512VBMI-LABEL: bzhi_i512_vector:
-; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdx
-; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm1 = [1,0]
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: negl %esi
-; AVX512VBMI-NEXT: movslq %esi, %r14
-; AVX512VBMI-NEXT: movq -16(%rsp,%r14), %rsi
-; AVX512VBMI-NEXT: movq -8(%rsp,%r14), %r8
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -24(%rsp,%r14), %r9
-; AVX512VBMI-NEXT: shldq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -32(%rsp,%r14), %r10
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -40(%rsp,%r14), %r11
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -48(%rsp,%r14), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r11
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT: movq -56(%rsp,%r14), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %rdi
-; AVX512VBMI-NEXT: movq -64(%rsp,%r14), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: shlxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT: addq $-1, %rcx
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %rdi
-; AVX512VBMI-NEXT: adcq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, %r10
-; AVX512VBMI-NEXT: adcq $-1, %r9
-; AVX512VBMI-NEXT: adcq $-1, %rsi
-; AVX512VBMI-NEXT: adcq $-1, %r8
-; AVX512VBMI-NEXT: andq %r14, %r8
-; AVX512VBMI-NEXT: vmovq %xmm1, %r14
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT: andq %r14, %rsi
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT: andq %r14, %r9
-; AVX512VBMI-NEXT: vmovq %xmm1, %r14
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT: andq %r14, %r10
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT: andq %r14, %r11
-; AVX512VBMI-NEXT: vmovq %xmm1, %r14
-; AVX512VBMI-NEXT: andq %r14, %rdi
-; AVX512VBMI-NEXT: andq %rdx, %rbx
-; AVX512VBMI-NEXT: vmovq %xmm0, %rdx
-; AVX512VBMI-NEXT: andq %rdx, %rcx
-; AVX512VBMI-NEXT: movq %rcx, (%rax)
-; AVX512VBMI-NEXT: movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT: movq %r11, 24(%rax)
-; AVX512VBMI-NEXT: movq %r10, 32(%rax)
-; AVX512VBMI-NEXT: movq %r9, 40(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 48(%rax)
-; AVX512VBMI-NEXT: movq %r8, 56(%rax)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VL-NEXT: movl $1, %ecx
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: bzhi_i512_vector:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movl $1, %ecx
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT: shrl $6, %esi
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT: vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %ecx
+; AVX512VBMI-NEXT: vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %edx
+; AVX512VBMI-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT: xorl %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT: vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
@@ -4740,190 +4143,73 @@ define i512 @bzhi_i512_load(ptr %p0, i512 %idx) nounwind {
;
; AVX512F-LABEL: bzhi_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: negl %edx
-; AVX512F-NEXT: movslq %edx, %r14
-; AVX512F-NEXT: movq -16(%rsp,%r14), %rdx
-; AVX512F-NEXT: movq -8(%rsp,%r14), %r8
-; AVX512F-NEXT: shldq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -24(%rsp,%r14), %r9
-; AVX512F-NEXT: shldq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -32(%rsp,%r14), %r10
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: movq -40(%rsp,%r14), %r11
-; AVX512F-NEXT: shldq %cl, %r11, %r10
-; AVX512F-NEXT: movq -48(%rsp,%r14), %rbx
-; AVX512F-NEXT: shldq %cl, %rbx, %r11
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r14), %r15
-; AVX512F-NEXT: movq -56(%rsp,%r14), %rdi
-; AVX512F-NEXT: shldq %cl, %rdi, %rbx
-; AVX512F-NEXT: shldq %cl, %r15, %rdi
-; AVX512F-NEXT: shlxq %rcx, %r15, %rcx
-; AVX512F-NEXT: addq $-1, %rcx
-; AVX512F-NEXT: adcq $-1, %rdi
-; AVX512F-NEXT: adcq $-1, %rbx
-; AVX512F-NEXT: adcq $-1, %r11
-; AVX512F-NEXT: adcq $-1, %r10
-; AVX512F-NEXT: adcq $-1, %r9
-; AVX512F-NEXT: adcq $-1, %rdx
-; AVX512F-NEXT: adcq $-1, %r8
-; AVX512F-NEXT: andq 56(%rsi), %r8
-; AVX512F-NEXT: andq 48(%rsi), %rdx
-; AVX512F-NEXT: andq 40(%rsi), %r9
-; AVX512F-NEXT: andq 32(%rsi), %r10
-; AVX512F-NEXT: andq 24(%rsi), %r11
-; AVX512F-NEXT: andq 16(%rsi), %rbx
-; AVX512F-NEXT: andq 8(%rsi), %rdi
-; AVX512F-NEXT: andq (%rsi), %rcx
-; AVX512F-NEXT: movq %rcx, (%rax)
-; AVX512F-NEXT: movq %rdi, 8(%rax)
-; AVX512F-NEXT: movq %rbx, 16(%rax)
-; AVX512F-NEXT: movq %r11, 24(%rax)
-; AVX512F-NEXT: movq %r10, 32(%rax)
-; AVX512F-NEXT: movq %r9, 40(%rax)
-; AVX512F-NEXT: movq %rdx, 48(%rax)
-; AVX512F-NEXT: movq %r8, 56(%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: movl $1, %ecx
+; AVX512F-NEXT: shlxq %rdx, %rcx, %rdi
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxq %rdx, %rcx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpandq (%rsi), %zmm1, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: bzhi_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: negl %edx
-; AVX512VL-NEXT: movslq %edx, %rax
-; AVX512VL-NEXT: movq -16(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -8(%rsp,%rax), %r8
-; AVX512VL-NEXT: shldq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT: shldq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT: shldq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -40(%rsp,%rax), %r11
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -48(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shldq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -56(%rsp,%rax), %r14
-; AVX512VL-NEXT: shldq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -64(%rsp,%rax), %r15
-; AVX512VL-NEXT: shldq %cl, %r15, %r14
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shlxq %rcx, %r15, %rcx
-; AVX512VL-NEXT: addq $-1, %rcx
-; AVX512VL-NEXT: adcq $-1, %r14
-; AVX512VL-NEXT: adcq $-1, %rbx
-; AVX512VL-NEXT: adcq $-1, %r11
-; AVX512VL-NEXT: adcq $-1, %r10
-; AVX512VL-NEXT: adcq $-1, %r9
-; AVX512VL-NEXT: adcq $-1, %rdx
-; AVX512VL-NEXT: adcq $-1, %r8
-; AVX512VL-NEXT: andq 56(%rsi), %r8
-; AVX512VL-NEXT: andq 48(%rsi), %rdx
-; AVX512VL-NEXT: andq 40(%rsi), %r9
-; AVX512VL-NEXT: andq 32(%rsi), %r10
-; AVX512VL-NEXT: andq 24(%rsi), %r11
-; AVX512VL-NEXT: andq 16(%rsi), %rbx
-; AVX512VL-NEXT: andq 8(%rsi), %r14
-; AVX512VL-NEXT: andq (%rsi), %rcx
-; AVX512VL-NEXT: movq %rcx, (%rdi)
-; AVX512VL-NEXT: movq %r14, 8(%rdi)
-; AVX512VL-NEXT: movq %rbx, 16(%rdi)
-; AVX512VL-NEXT: movq %r11, 24(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 40(%rdi)
-; AVX512VL-NEXT: movq %rdx, 48(%rdi)
-; AVX512VL-NEXT: movq %r8, 56(%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: movl $1, %ecx
+; AVX512VL-NEXT: shlxq %rdx, %rcx, %rdi
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: shlxq %rdx, %rcx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpandq (%rsi), %zmm1, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: bzhi_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: negl %edx
-; AVX512VBMI-NEXT: movslq %edx, %rax
-; AVX512VBMI-NEXT: movq -16(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -8(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: shldq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -40(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -48(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -56(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -64(%rsp,%rax), %r15
-; AVX512VBMI-NEXT: shldq %cl, %r15, %r14
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shlxq %rcx, %r15, %rcx
-; AVX512VBMI-NEXT: addq $-1, %rcx
-; AVX512VBMI-NEXT: adcq $-1, %r14
-; AVX512VBMI-NEXT: adcq $-1, %rbx
-; AVX512VBMI-NEXT: adcq $-1, %r11
-; AVX512VBMI-NEXT: adcq $-1, %r10
-; AVX512VBMI-NEXT: adcq $-1, %r9
-; AVX512VBMI-NEXT: adcq $-1, %rdx
-; AVX512VBMI-NEXT: adcq $-1, %r8
-; AVX512VBMI-NEXT: andq 56(%rsi), %r8
-; AVX512VBMI-NEXT: andq 48(%rsi), %rdx
-; AVX512VBMI-NEXT: andq 40(%rsi), %r9
-; AVX512VBMI-NEXT: andq 32(%rsi), %r10
-; AVX512VBMI-NEXT: andq 24(%rsi), %r11
-; AVX512VBMI-NEXT: andq 16(%rsi), %rbx
-; AVX512VBMI-NEXT: andq 8(%rsi), %r14
-; AVX512VBMI-NEXT: andq (%rsi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, (%rdi)
-; AVX512VBMI-NEXT: movq %r14, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 40(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 56(%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: movl $1, %ecx
+; AVX512VBMI-NEXT: shlxq %rdx, %rcx, %rdi
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: shlxq %rdx, %rcx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %ecx
+; AVX512VBMI-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %edx
+; AVX512VBMI-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT: xorl %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpandq (%rsi), %zmm1, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -5183,120 +4469,81 @@ define i512 @isolate_msb_i512(i512 %a0, i512 %idx) nounwind {
;
; AVX512F-LABEL: isolate_msb_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: subq $24, %rsp
-; AVX512F-NEXT: movq %r8, %r12
-; AVX512F-NEXT: movq %rcx, %r8
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: vmovq %rdx, %xmm0
-; AVX512F-NEXT: vmovq %r12, %xmm1
+; AVX512F-NEXT: orq %rax, %rdx
+; AVX512F-NEXT: vmovq %r8, %xmm1
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r8
+; AVX512F-NEXT: orq %rdx, %r8
; AVX512F-NEXT: vmovq %rsi, %xmm2
+; AVX512F-NEXT: orq %r9, %rsi
; AVX512F-NEXT: vmovq %rcx, %xmm3
+; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT: orq %rsi, %rcx
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: orq %r8, %rcx
+; AVX512F-NEXT: sete %dl
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512F-NEXT: vmovq %r9, %xmm2
-; AVX512F-NEXT: vmovq %r10, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT: vmovq %r9, %xmm1
+; AVX512F-NEXT: vmovq %rax, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovd %xmm0, %r13d
-; AVX512F-NEXT: movl %r13d, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %r13d
-; AVX512F-NEXT: andl $56, %r13d
-; AVX512F-NEXT: movq -56(%rsp,%r13), %rax
-; AVX512F-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: movq -64(%rsp,%r13), %r14
-; AVX512F-NEXT: movq %r14, %r11
-; AVX512F-NEXT: shrdq %cl, %rax, %r11
-; AVX512F-NEXT: movq -72(%rsp,%r13), %r15
-; AVX512F-NEXT: movq %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r14, %rbx
-; AVX512F-NEXT: movq -80(%rsp,%r13), %rbp
-; AVX512F-NEXT: movq %rbp, %r14
-; AVX512F-NEXT: shrdq %cl, %r15, %r14
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT: movq -88(%rsp,%r13), %rax
-; AVX512F-NEXT: movq %rax, %r15
-; AVX512F-NEXT: shrdq %cl, %rbp, %r15
-; AVX512F-NEXT: orq %r10, %rdx
-; AVX512F-NEXT: orq %rdx, %r12
-; AVX512F-NEXT: orq %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%r13), %rbp
-; AVX512F-NEXT: movq %rbp, %rdx
-; AVX512F-NEXT: shrdq %cl, %rax, %rdx
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT: movq -112(%rsp,%r13), %r9
-; AVX512F-NEXT: movq -104(%rsp,%r13), %rax
-; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: shrdq %cl, %rbp, %r10
-; AVX512F-NEXT: shrdq %cl, %rax, %r9
-; AVX512F-NEXT: orq %rsi, %r8
-; AVX512F-NEXT: xorl %eax, %eax
-; AVX512F-NEXT: orq %r12, %r8
-; AVX512F-NEXT: shrxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512F-NEXT: cmoveq %rax, %r10
-; AVX512F-NEXT: cmoveq %rax, %rdx
-; AVX512F-NEXT: cmoveq %rax, %r15
-; AVX512F-NEXT: cmoveq %rax, %r14
-; AVX512F-NEXT: cmoveq %rax, %rbx
-; AVX512F-NEXT: cmoveq %rax, %r11
-; AVX512F-NEXT: cmoveq %rax, %r9
-; AVX512F-NEXT: cmoveq %rax, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %r11, 48(%rdi)
-; AVX512F-NEXT: movq %rbx, 40(%rdi)
-; AVX512F-NEXT: movq %r14, 32(%rdi)
-; AVX512F-NEXT: movq %r15, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %r10, 8(%rdi)
-; AVX512F-NEXT: movq %r9, (%rdi)
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: movl %eax, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: addq $24, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT: negl %edx
+; AVX512F-NEXT: kmovw %edx, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rcx, %rax
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: vmovq %r8, %xmm0
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT: vmovq %rdx, %xmm0
+; AVX512VL-NEXT: vmovq %r8, %xmm1
; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
+; AVX512VL-NEXT: orq %rax, %rdx
+; AVX512VL-NEXT: orq %rdx, %r8
; AVX512VL-NEXT: vmovq %rsi, %xmm2
; AVX512VL-NEXT: vmovq %rcx, %xmm3
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT: orq %r9, %rsi
+; AVX512VL-NEXT: orq %rsi, %rcx
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: orq %r8, %rcx
+; AVX512VL-NEXT: sete %dl
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
; AVX512VL-NEXT: vmovq %r9, %xmm2
-; AVX512VL-NEXT: vmovq %r10, %xmm3
+; AVX512VL-NEXT: vmovq %rax, %xmm3
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -5304,159 +4551,78 @@ define i512 @isolate_msb_i512(i512 %a0, i512 %idx) nounwind {
; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: movl %eax, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovd %xmm0, %r14d
-; AVX512VL-NEXT: movl %r14d, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %r14d
-; AVX512VL-NEXT: andl $56, %r14d
-; AVX512VL-NEXT: movq -72(%rsp,%r14), %r15
-; AVX512VL-NEXT: movq -80(%rsp,%r14), %r12
-; AVX512VL-NEXT: movq %r12, %r11
-; AVX512VL-NEXT: shrdq %cl, %r15, %r11
-; AVX512VL-NEXT: movq -88(%rsp,%r14), %r13
-; AVX512VL-NEXT: movq %r13, %rbx
-; AVX512VL-NEXT: shrdq %cl, %r12, %rbx
-; AVX512VL-NEXT: orq %r10, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%r14), %r12
-; AVX512VL-NEXT: movq %r12, %r10
-; AVX512VL-NEXT: shrdq %cl, %r13, %r10
-; AVX512VL-NEXT: orq %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%r14), %r13
-; AVX512VL-NEXT: movq %r13, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r12, %rdx
-; AVX512VL-NEXT: orq %r9, %rsi
-; AVX512VL-NEXT: movq -112(%rsp,%r14), %r12
-; AVX512VL-NEXT: movq %r12, %r9
-; AVX512VL-NEXT: shrdq %cl, %r13, %r9
-; AVX512VL-NEXT: orq %rsi, %rax
-; AVX512VL-NEXT: movq -120(%rsp,%r14), %r13
-; AVX512VL-NEXT: movq %r13, %rsi
-; AVX512VL-NEXT: shrdq %cl, %r12, %rsi
-; AVX512VL-NEXT: movq -128(%rsp,%r14), %r14
-; AVX512VL-NEXT: shrdq %cl, %r13, %r14
-; AVX512VL-NEXT: xorl %r12d, %r12d
-; AVX512VL-NEXT: orq %r8, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r15, %rax
-; AVX512VL-NEXT: cmoveq %r12, %rsi
-; AVX512VL-NEXT: cmoveq %r12, %r9
-; AVX512VL-NEXT: cmoveq %r12, %rdx
-; AVX512VL-NEXT: cmoveq %r12, %r10
-; AVX512VL-NEXT: cmoveq %r12, %rbx
-; AVX512VL-NEXT: cmoveq %r12, %r11
-; AVX512VL-NEXT: cmoveq %r12, %r14
-; AVX512VL-NEXT: cmoveq %r12, %rax
-; AVX512VL-NEXT: movq %rax, 56(%rdi)
-; AVX512VL-NEXT: movq %r11, 48(%rdi)
-; AVX512VL-NEXT: movq %rbx, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %rdx, 24(%rdi)
-; AVX512VL-NEXT: movq %r9, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r14, (%rdi)
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: negl %edx
+; AVX512VL-NEXT: kmovd %edx, %k0
+; AVX512VL-NEXT: knotw %k0, %k1
+; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq %rcx, %rax
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: vmovq %r8, %xmm0
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: vmovq %rdx, %xmm0
+; AVX512VBMI-NEXT: orq %rdi, %rdx
+; AVX512VBMI-NEXT: vmovq %r8, %xmm1
; AVX512VBMI-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; AVX512VBMI-NEXT: vmovq %rdx, %xmm1
+; AVX512VBMI-NEXT: orq %rdx, %r8
; AVX512VBMI-NEXT: vmovq %rsi, %xmm2
+; AVX512VBMI-NEXT: orq %r9, %rsi
; AVX512VBMI-NEXT: vmovq %rcx, %xmm3
-; AVX512VBMI-NEXT: orq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VBMI-NEXT: vmovq %r9, %xmm2
-; AVX512VBMI-NEXT: vmovq %r10, %xmm3
-; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VBMI-NEXT: orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT: orq %rsi, %rcx
+; AVX512VBMI-NEXT: xorl %edx, %edx
+; AVX512VBMI-NEXT: orq %r8, %rcx
+; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VBMI-NEXT: vmovq %r9, %xmm1
+; AVX512VBMI-NEXT: vmovq %rdi, %xmm3
+; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
; AVX512VBMI-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovd %xmm0, %r14d
-; AVX512VBMI-NEXT: movl %r14d, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %r14d
-; AVX512VBMI-NEXT: andl $56, %r14d
-; AVX512VBMI-NEXT: movq -72(%rsp,%r14), %r15
-; AVX512VBMI-NEXT: movq -80(%rsp,%r14), %r12
-; AVX512VBMI-NEXT: movq %r12, %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r11
-; AVX512VBMI-NEXT: movq -88(%rsp,%r14), %r13
-; AVX512VBMI-NEXT: movq %r13, %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rbx
-; AVX512VBMI-NEXT: orq %r10, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%r14), %r12
-; AVX512VBMI-NEXT: movq %r12, %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r13, %r10
-; AVX512VBMI-NEXT: orq %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%r14), %r13
-; AVX512VBMI-NEXT: movq %r13, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rdx
-; AVX512VBMI-NEXT: orq %r9, %rsi
-; AVX512VBMI-NEXT: movq -112(%rsp,%r14), %r12
-; AVX512VBMI-NEXT: movq %r12, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r13, %r9
-; AVX512VBMI-NEXT: orq %rsi, %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%r14), %r13
-; AVX512VBMI-NEXT: movq %r13, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rsi
-; AVX512VBMI-NEXT: movq -128(%rsp,%r14), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r13, %r14
-; AVX512VBMI-NEXT: xorl %r12d, %r12d
-; AVX512VBMI-NEXT: orq %r8, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r15, %rax
-; AVX512VBMI-NEXT: cmoveq %r12, %rsi
-; AVX512VBMI-NEXT: cmoveq %r12, %r9
-; AVX512VBMI-NEXT: cmoveq %r12, %rdx
-; AVX512VBMI-NEXT: cmoveq %r12, %r10
-; AVX512VBMI-NEXT: cmoveq %r12, %rbx
-; AVX512VBMI-NEXT: cmoveq %r12, %r11
-; AVX512VBMI-NEXT: cmoveq %r12, %r14
-; AVX512VBMI-NEXT: cmoveq %r12, %rax
-; AVX512VBMI-NEXT: movq %rax, 56(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 48(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r14, (%rdi)
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: sete %dl
+; AVX512VBMI-NEXT: vmovq %xmm0, %rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: negl %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k0
+; AVX512VBMI-NEXT: knotw %k0, %k1
+; AVX512VBMI-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%eqz = icmp eq i512 %a0, 0
@@ -5828,208 +4994,109 @@ define i512 @isolate_msb_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
;
; AVX512F-LABEL: isolate_msb_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm1
-; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT: vmovd %xmm1, %ebx
-; AVX512F-NEXT: movl %ebx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %ebx
-; AVX512F-NEXT: andl $56, %ebx
-; AVX512F-NEXT: movq -72(%rsp,%rbx), %r10
-; AVX512F-NEXT: movq -80(%rsp,%rbx), %rax
-; AVX512F-NEXT: movq %rax, %rdx
-; AVX512F-NEXT: shrdq %cl, %r10, %rdx
-; AVX512F-NEXT: movq -88(%rsp,%rbx), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rax, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rbx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %r9, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rbx), %r14
-; AVX512F-NEXT: movq %r14, %r9
-; AVX512F-NEXT: shrdq %cl, %rax, %r9
-; AVX512F-NEXT: movq -112(%rsp,%rbx), %r15
-; AVX512F-NEXT: movq %r15, %r11
-; AVX512F-NEXT: shrdq %cl, %r14, %r11
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -128(%rsp,%rbx), %rdi
-; AVX512F-NEXT: movq -120(%rsp,%rbx), %r14
-; AVX512F-NEXT: movq %r14, %rbx
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r14, %rdi
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: xorl %r14d, %r14d
+; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: kortestw %k0, %k0
-; AVX512F-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512F-NEXT: cmoveq %r14, %rbx
-; AVX512F-NEXT: cmoveq %r14, %r11
-; AVX512F-NEXT: cmoveq %r14, %r9
-; AVX512F-NEXT: cmoveq %r14, %r8
-; AVX512F-NEXT: cmoveq %r14, %rsi
-; AVX512F-NEXT: cmoveq %r14, %rdx
-; AVX512F-NEXT: cmoveq %r14, %rdi
-; AVX512F-NEXT: cmoveq %r14, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rax)
-; AVX512F-NEXT: movq %rdx, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 40(%rax)
-; AVX512F-NEXT: movq %r8, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %rbx, 8(%rax)
-; AVX512F-NEXT: movq %rdi, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: sete %cl
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: movl %eax, %edx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT: negl %ecx
+; AVX512F-NEXT: kmovw %ecx, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq %zmm0, %zmm2, %zmm2
-; AVX512VL-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vplzcntq %zmm2, %zmm2
-; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vpcompressq %zmm2, %zmm2 {%k1}
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovd %xmm2, %r10d
-; AVX512VL-NEXT: movl %r10d, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %r10d
-; AVX512VL-NEXT: andl $56, %r10d
-; AVX512VL-NEXT: movq -72(%rsp,%r10), %r11
-; AVX512VL-NEXT: movq -80(%rsp,%r10), %rax
-; AVX512VL-NEXT: movq %rax, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq -88(%rsp,%r10), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%r10), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %r9, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%r10), %rbx
-; AVX512VL-NEXT: movq %rbx, %r9
-; AVX512VL-NEXT: shrdq %cl, %rax, %r9
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -112(%rsp,%r10), %r14
-; AVX512VL-NEXT: movq %r14, %rdi
-; AVX512VL-NEXT: shrdq %cl, %rbx, %rdi
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512VL-NEXT: movq -120(%rsp,%r10), %r15
-; AVX512VL-NEXT: movq %r15, %rbx
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%r10), %r10
-; AVX512VL-NEXT: shrdq %cl, %r15, %r10
-; AVX512VL-NEXT: xorl %r14d, %r14d
+; AVX512VL-NEXT: xorl %ecx, %ecx
; AVX512VL-NEXT: kortestw %k0, %k0
-; AVX512VL-NEXT: shrxq %rcx, %r11, %rcx
-; AVX512VL-NEXT: cmoveq %r14, %rbx
-; AVX512VL-NEXT: cmoveq %r14, %rdi
-; AVX512VL-NEXT: cmoveq %r14, %r9
-; AVX512VL-NEXT: cmoveq %r14, %r8
-; AVX512VL-NEXT: cmoveq %r14, %rsi
-; AVX512VL-NEXT: cmoveq %r14, %rdx
-; AVX512VL-NEXT: cmoveq %r14, %r10
-; AVX512VL-NEXT: cmoveq %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rax)
-; AVX512VL-NEXT: movq %rdx, 48(%rax)
-; AVX512VL-NEXT: movq %rsi, 40(%rax)
-; AVX512VL-NEXT: movq %r8, 32(%rax)
-; AVX512VL-NEXT: movq %r9, 24(%rax)
-; AVX512VL-NEXT: movq %rdi, 16(%rax)
-; AVX512VL-NEXT: movq %rbx, 8(%rax)
-; AVX512VL-NEXT: movq %r10, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: movl %eax, %edx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: negl %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k0
+; AVX512VL-NEXT: knotw %k0, %k1
+; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VBMI-NEXT: vpermq %zmm0, %zmm2, %zmm2
-; AVX512VBMI-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512VBMI-NEXT: vplzcntq %zmm2, %zmm2
-; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VBMI-NEXT: vpcompressq %zmm2, %zmm2 {%k1}
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovd %xmm2, %r10d
-; AVX512VBMI-NEXT: movl %r10d, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %r10d
-; AVX512VBMI-NEXT: andl $56, %r10d
-; AVX512VBMI-NEXT: movq -72(%rsp,%r10), %r11
-; AVX512VBMI-NEXT: movq -80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: movq %rax, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq -88(%rsp,%r10), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT: movq %rbx, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %r9
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT: movq %r14, %rdi
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %rdi
; AVX512VBMI-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512VBMI-NEXT: movq -120(%rsp,%r10), %r15
-; AVX512VBMI-NEXT: movq %r15, %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%r10), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r10
-; AVX512VBMI-NEXT: xorl %r14d, %r14d
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: kortestw %k0, %k0
-; AVX512VBMI-NEXT: shrxq %rcx, %r11, %rcx
-; AVX512VBMI-NEXT: cmoveq %r14, %rbx
-; AVX512VBMI-NEXT: cmoveq %r14, %rdi
-; AVX512VBMI-NEXT: cmoveq %r14, %r9
-; AVX512VBMI-NEXT: cmoveq %r14, %r8
-; AVX512VBMI-NEXT: cmoveq %r14, %rsi
-; AVX512VBMI-NEXT: cmoveq %r14, %rdx
-; AVX512VBMI-NEXT: cmoveq %r14, %r10
-; AVX512VBMI-NEXT: cmoveq %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 48(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT: movq %r8, 32(%rax)
-; AVX512VBMI-NEXT: movq %r9, 24(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r10, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VBMI-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VBMI-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: sete %cl
+; AVX512VBMI-NEXT: vmovq %xmm0, %rdx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: negl %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k0
+; AVX512VBMI-NEXT: knotw %k0, %k1
+; AVX512VBMI-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
@@ -6300,211 +5367,112 @@ define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
;
; AVX512F-LABEL: isolate_msb_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm1
-; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm1, %ebx
-; AVX512F-NEXT: movl %ebx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %ebx
-; AVX512F-NEXT: andl $56, %ebx
-; AVX512F-NEXT: movq -72(%rsp,%rbx), %r10
-; AVX512F-NEXT: movq -80(%rsp,%rbx), %rax
-; AVX512F-NEXT: movq %rax, %rdx
-; AVX512F-NEXT: shrdq %cl, %r10, %rdx
-; AVX512F-NEXT: movq -88(%rsp,%rbx), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rax, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rbx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %r9, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rbx), %r14
-; AVX512F-NEXT: movq %r14, %r9
-; AVX512F-NEXT: shrdq %cl, %rax, %r9
-; AVX512F-NEXT: movq -112(%rsp,%rbx), %r15
-; AVX512F-NEXT: movq %r15, %r11
-; AVX512F-NEXT: shrdq %cl, %r14, %r11
-; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -128(%rsp,%rbx), %rdi
-; AVX512F-NEXT: movq -120(%rsp,%rbx), %r14
-; AVX512F-NEXT: movq %r14, %rbx
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r14, %rdi
; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT: xorl %r14d, %r14d
+; AVX512F-NEXT: xorl %ecx, %ecx
; AVX512F-NEXT: kortestw %k0, %k0
-; AVX512F-NEXT: shrxq %rcx, %r10, %rcx
-; AVX512F-NEXT: cmoveq %r14, %rbx
-; AVX512F-NEXT: cmoveq %r14, %r11
-; AVX512F-NEXT: cmoveq %r14, %r9
-; AVX512F-NEXT: cmoveq %r14, %r8
-; AVX512F-NEXT: cmoveq %r14, %rsi
-; AVX512F-NEXT: cmoveq %r14, %rdx
-; AVX512F-NEXT: cmoveq %r14, %rdi
-; AVX512F-NEXT: cmoveq %r14, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rax)
-; AVX512F-NEXT: movq %rdx, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 40(%rax)
-; AVX512F-NEXT: movq %r8, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %rbx, 8(%rax)
-; AVX512F-NEXT: movq %rdi, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: sete %cl
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovq %xmm0, %rax
+; AVX512F-NEXT: movl %eax, %edx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT: negl %ecx
+; AVX512F-NEXT: kmovw %ecx, %k0
+; AVX512F-NEXT: knotw %k0, %k1
+; AVX512F-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: isolate_msb_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq %zmm0, %zmm2, %zmm2
-; AVX512VL-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT: vplzcntq %zmm2, %zmm2
-; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovd %xmm2, %r10d
-; AVX512VL-NEXT: movl %r10d, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %r10d
-; AVX512VL-NEXT: andl $56, %r10d
-; AVX512VL-NEXT: movq -72(%rsp,%r10), %r11
-; AVX512VL-NEXT: movq -80(%rsp,%r10), %rax
-; AVX512VL-NEXT: movq %rax, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq -88(%rsp,%r10), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%r10), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %r9, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%r10), %rbx
-; AVX512VL-NEXT: movq %rbx, %r9
-; AVX512VL-NEXT: shrdq %cl, %rax, %r9
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -112(%rsp,%r10), %r14
-; AVX512VL-NEXT: movq %r14, %rdi
-; AVX512VL-NEXT: shrdq %cl, %rbx, %rdi
; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512VL-NEXT: movq -120(%rsp,%r10), %r15
-; AVX512VL-NEXT: movq %r15, %rbx
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%r10), %r10
-; AVX512VL-NEXT: shrdq %cl, %r15, %r10
-; AVX512VL-NEXT: xorl %r14d, %r14d
+; AVX512VL-NEXT: xorl %ecx, %ecx
; AVX512VL-NEXT: kortestw %k0, %k0
-; AVX512VL-NEXT: shrxq %rcx, %r11, %rcx
-; AVX512VL-NEXT: cmoveq %r14, %rbx
-; AVX512VL-NEXT: cmoveq %r14, %rdi
-; AVX512VL-NEXT: cmoveq %r14, %r9
-; AVX512VL-NEXT: cmoveq %r14, %r8
-; AVX512VL-NEXT: cmoveq %r14, %rsi
-; AVX512VL-NEXT: cmoveq %r14, %rdx
-; AVX512VL-NEXT: cmoveq %r14, %r10
-; AVX512VL-NEXT: cmoveq %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rax)
-; AVX512VL-NEXT: movq %rdx, 48(%rax)
-; AVX512VL-NEXT: movq %rsi, 40(%rax)
-; AVX512VL-NEXT: movq %r8, 32(%rax)
-; AVX512VL-NEXT: movq %r9, 24(%rax)
-; AVX512VL-NEXT: movq %rdi, 16(%rax)
-; AVX512VL-NEXT: movq %rbx, 8(%rax)
-; AVX512VL-NEXT: movq %r10, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: sete %cl
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovq %xmm0, %rax
+; AVX512VL-NEXT: movl %eax, %edx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: negl %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k0
+; AVX512VL-NEXT: knotw %k0, %k1
+; AVX512VL-NEXT: vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: isolate_msb_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VBMI-NEXT: vpermq %zmm0, %zmm2, %zmm2
-; AVX512VBMI-NEXT: vptestmq %zmm2, %zmm2, %k1
-; AVX512VBMI-NEXT: vplzcntq %zmm2, %zmm2
-; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VBMI-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovd %xmm2, %r10d
-; AVX512VBMI-NEXT: movl %r10d, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %r10d
-; AVX512VBMI-NEXT: andl $56, %r10d
-; AVX512VBMI-NEXT: movq -72(%rsp,%r10), %r11
-; AVX512VBMI-NEXT: movq -80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: movq %rax, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq -88(%rsp,%r10), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%r10), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT: movq %rbx, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %r9
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT: movq %r14, %rdi
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %rdi
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512VBMI-NEXT: vptestmd %zmm0, %zmm0, %k0
-; AVX512VBMI-NEXT: movq -120(%rsp,%r10), %r15
-; AVX512VBMI-NEXT: movq %r15, %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%r10), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r10
-; AVX512VBMI-NEXT: xorl %r14d, %r14d
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: kortestw %k0, %k0
-; AVX512VBMI-NEXT: shrxq %rcx, %r11, %rcx
-; AVX512VBMI-NEXT: cmoveq %r14, %rbx
-; AVX512VBMI-NEXT: cmoveq %r14, %rdi
-; AVX512VBMI-NEXT: cmoveq %r14, %r9
-; AVX512VBMI-NEXT: cmoveq %r14, %r8
-; AVX512VBMI-NEXT: cmoveq %r14, %rsi
-; AVX512VBMI-NEXT: cmoveq %r14, %rdx
-; AVX512VBMI-NEXT: cmoveq %r14, %r10
-; AVX512VBMI-NEXT: cmoveq %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 48(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT: movq %r8, 32(%rax)
-; AVX512VBMI-NEXT: movq %r9, 24(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r10, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VBMI-NEXT: vpermq %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: vptestmq %zmm0, %zmm0, %k1
+; AVX512VBMI-NEXT: vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: sete %cl
+; AVX512VBMI-NEXT: vmovq %xmm0, %rdx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: negl %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k0
+; AVX512VBMI-NEXT: knotw %k0, %k1
+; AVX512VBMI-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..c8ac18d1d309a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -133,68 +133,42 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: shl_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: movslq %eax, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: cltq
+; AVX512F-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsllq %xmm1, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7],zmm2[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
@@ -210,53 +184,31 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: movslq %eax, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: cltq
+; AVX512VL-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -265,50 +217,23 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: movslq %eax, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: movl %edi, %ecx
+; AVX512VBMI-NEXT: shrl $3, %ecx
+; AVX512VBMI-NEXT: andl $56, %ecx
+; AVX512VBMI-NEXT: negl %ecx
+; AVX512VBMI-NEXT: movslq %ecx, %rcx
+; AVX512VBMI-NEXT: vmovdqu64 -64(%rsp,%rcx), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 %a0, %a1
@@ -428,16 +353,14 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -445,44 +368,25 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %r15, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %rsi, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -498,91 +402,52 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT: shrl $3, %edi
+; AVX512VBMI-NEXT: andl $56, %edi
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = lshr i512 %a0, %a1
@@ -711,14 +576,12 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
+; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -735,49 +598,32 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpsraq $63, -72(%rsp,%rax){1to8}, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %r15, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %rsi, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -793,97 +639,62 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT: shrdq %cl, %r9, %r15
+; AVX512VL-NEXT: shrl $3, %ecx
+; AVX512VL-NEXT: andl $56, %ecx
+; AVX512VL-NEXT: vpsraq $63, -72(%rsp,%rcx){1to8}, %zmm0
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,15]
+; AVX512VL-NEXT: vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm2, %zmm0
+; AVX512VL-NEXT: andl $63, %eax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm2
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm3
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm1, %zmm1
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VBMI-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %r10
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT: movq %r9, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: sarq $63, %rdi
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpbroadcastq %r10, %zmm0
+; AVX512VBMI-NEXT: # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX512VBMI-NEXT: shrl $3, %r10d
+; AVX512VBMI-NEXT: andl $56, %r10d
+; AVX512VBMI-NEXT: vpsraq $63, -72(%rsp,%r10){1to8}, %zmm1
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%r10), %zmm2
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
+; AVX512VBMI-NEXT: vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm3, %zmm2
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm2, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = ashr i512 %a0, %a1
ret i512 %r
@@ -1008,162 +819,64 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: shl_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: negl %edx
-; AVX512F-NEXT: movslq %edx, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %edx, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpexpandq (%rsi), %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: negl %edx
-; AVX512VL-NEXT: movslq %edx, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq (%rsi), %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: negl %edx
-; AVX512VBMI-NEXT: movslq %edx, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edi
+; AVX512VBMI-NEXT: shlxl %ecx, %edi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1276,141 +989,67 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %edx, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %rdx, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rsi, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rsi, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VL-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -1539,174 +1178,76 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rsi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512F-NEXT: movq 48(%rsi), %rax
-; AVX512F-NEXT: movq 56(%rsi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %edx, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rsi, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rsi
-; AVX512F-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT: shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl %edx, %eax
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm0, %xmm4
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %rdx, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rsi, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rsi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512VL-NEXT: movq 48(%rsi), %rax
-; AVX512VL-NEXT: movq 56(%rsi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %edx, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT: shrdq %cl, %rax, %rdx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rsi, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rdx, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %esi
+; AVX512VL-NEXT: shlxl %edx, %esi, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm1, %zmm2
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rsi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rsi), %rax
-; AVX512VBMI-NEXT: movq 56(%rsi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %edx, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT: movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rdx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm2
+; AVX512VBMI-NEXT: vpermq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm2
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm1, %zmm3
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm3 {%k1}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm0 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm2, %zmm0, %zmm3
+; AVX512VBMI-NEXT: vmovdqu64 %zmm3, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -2223,160 +1764,39 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: shl_1_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: negl %esi
-; AVX512F-NEXT: movslq %esi, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: movl $1, %ecx
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_1_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: negl %esi
-; AVX512VL-NEXT: movslq %esi, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: movl $1, %ecx
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_1_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: negl %esi
-; AVX512VBMI-NEXT: movslq %esi, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: movl $1, %ecx
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT: shrl $6, %esi
+; AVX512VBMI-NEXT: shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 1, %a0
@@ -2485,139 +1905,67 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: lshr_signbit_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %rsi, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_signbit_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_signbit_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -2732,141 +2080,70 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
;
; AVX512F-LABEL: ashr_signbit_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: pushq %rax
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %esi
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %esi, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = -1
-; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %esi, %ecx
-; AVX512F-NEXT: andl $63, %ecx
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT: movq %rax, %r8
-; AVX512F-NEXT: shrdq %cl, %rdx, %r8
-; AVX512F-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT: shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT: shrdq %cl, %rax, %r14
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpand %xmm3, %xmm1, %xmm4
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; AVX512F-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT: vpsllq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: sarxq %rcx, %rsi, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r9, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %r8, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_signbit_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: andl $63, %ecx
-; AVX512VL-NEXT: shrl $3, %esi
-; AVX512VL-NEXT: andl $56, %esi
-; AVX512VL-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT: shrdq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT: movq %rcx, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r9, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %r8, 8(%rdi)
-; AVX512VL-NEXT: movq %rsi, (%rdi)
-; AVX512VL-NEXT: addq $8, %rsp
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %esi, %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_signbit_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %esi, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT: movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT: movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, (%rdi)
-; AVX512VBMI-NEXT: addq $8, %rsp
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm0
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%s = shl i512 1, 511
@@ -3099,63 +2376,51 @@ define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovups (%rdi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: movl %ecx, %eax
+; AVX512VBMI-NEXT: shrl $6, %eax
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT: vmovq %xmm0, %rax
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3232,96 +2497,60 @@ define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_extract_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT: movq 48(%rdi), %rax
-; AVX512F-NEXT: movq 56(%rdi), %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rdx
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512F-NEXT: shrdq %cl, %rdx, %rax
-; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT: movq 48(%rdi), %rax
-; AVX512VL-NEXT: movq 56(%rdi), %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rdx
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VL-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rdi), %rax
-; AVX512VBMI-NEXT: movq 56(%rdi), %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rdx
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT: movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT: movl %ecx, %eax
+; AVX512VBMI-NEXT: shrl $6, %eax
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512VBMI-NEXT: vmovq %xmm1, %rax
; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3370,45 +2599,33 @@ define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rdi), %zmm0
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: movl $-1, %eax
+; AVX512VL-NEXT: shlxl %esi, %eax, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vmovq %xmm0, %rax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: lshr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovups 32(%rdi), %ymm1
-; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: movl $-1, %eax
+; AVX512VBMI-NEXT: shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vmovq %xmm0, %rax
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -3474,78 +2691,42 @@ define i64 @ashr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
;
; AVX512F-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovups (%rdi), %ymm0
-; AVX512F-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT: movq 48(%rdi), %rax
-; AVX512F-NEXT: movq 56(%rdi), %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: sarq $63, %rcx
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: andl $7, %esi
-; AVX512F-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT: popq %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT: movl $-1, %eax
+; AVX512F-NEXT: shlxl %esi, %eax, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovq %xmm1, %rax
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rax
-; AVX512VL-NEXT: vmovups (%rdi), %ymm0
-; AVX512VL-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT: movq 48(%rdi), %rax
-; AVX512VL-NEXT: movq 56(%rdi), %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: sarq $63, %rcx
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: andl $7, %esi
-; AVX512VL-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT: popq %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movl $-1, %eax
+; AVX512VL-NEXT: shlxl %esi, %eax, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovq %xmm1, %rax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: ashr_extract_idx_load_i512_i64:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rax
-; AVX512VBMI-NEXT: vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT: vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT: movq 48(%rdi), %rax
-; AVX512VBMI-NEXT: movq 56(%rdi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: sarq $63, %rcx
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: andl $7, %esi
-; AVX512VBMI-NEXT: movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT: popq %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT: vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT: movl $-1, %eax
+; AVX512VBMI-NEXT: shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT: kmovd %eax, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vmovq %xmm1, %rax
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 65b602801b365..e9ddc576c6cd8 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -11,10 +11,10 @@
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-HAVE-SHLD-HAVE-BMI2-AVX512
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2
@@ -27,10 +27,10 @@
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1
; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-HAVE-SHLD-HAVE-BMI2-AVX512
define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-NO-BMI2-LABEL: lshr_4bytes:
@@ -10617,262 +10617,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%r9,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r8,%r8), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leaq (%r9,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r8, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rdi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -96(%rsp,%rdi), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -104(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rsi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -112(%rsp,%rdi), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -80(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -88(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -72(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -128(%rsp,%rdi), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -120(%rsp,%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rsi), %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rcx, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
@@ -12810,563 +12602,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %cl, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ecx,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 120(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 116(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 124(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 48(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%ecx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm2, %xmm1, %xmm3
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm3, %zmm0, %zmm3
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm3, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -13474,26 +12763,29 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: andl $7, %eax
-; X64-AVX512-NEXT: vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX512-NEXT: vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX512-NEXT: vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX512-NEXT: vmovups -80(%rsp,%rax,8), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -13682,29 +12974,35 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $140, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT: andl $7, %ecx
-; X86-AVX512-NEXT: vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX512-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $140, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -14606,267 +13904,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r9), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r9), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r11, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r9), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: orq %r9, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r12, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shlq %cl, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movq %r9, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rdi, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r8, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r9, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r10, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r14, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %rbx, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r15, %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, -8(%rsp,%rsi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxq %rax, %r15, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orq %rcx, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addq $8, %rsp
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r14, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxq %rcx, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movq %r9, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rcx, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
@@ -16892,596 +15975,62 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: negl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%edi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 12(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%eax,8), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: subl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%edx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: notb %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 36(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT: retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal (,%ebx,8), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: andl $60, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: subl %ebx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 40(%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebp, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: negl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl 176(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %ebx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebx, 48(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %esi, 52(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ebp, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: addl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl (%esi), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm3 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%edx), %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm1, %xmm0, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %esi, %edx, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpexpandq (%ecx), %zmm3 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq $1, %zmm1, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -17598,29 +16147,27 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X64-AVX1-NEXT: vzeroupper
; X64-AVX1-NEXT: retq
;
-; X64-AVX512-LABEL: shl_64bytes_qwordOff:
-; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: pushq %rax
-; X64-AVX512-NEXT: vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT: movl (%rsi), %eax
-; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT: shll $3, %eax
-; X64-AVX512-NEXT: andl $56, %eax
-; X64-AVX512-NEXT: negl %eax
-; X64-AVX512-NEXT: cltq
-; X64-AVX512-NEXT: vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT: vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT: vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT: vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT: popq %rax
-; X64-AVX512-NEXT: vzeroupper
-; X64-AVX512-NEXT: retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: shl_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -17821,33 +16368,37 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
; X86-AVX1-NEXT: vzeroupper
; X86-AVX1-NEXT: retl
;
-; X86-AVX512-LABEL: shl_64bytes_qwordOff:
-; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: subl $140, %esp
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: vmovups (%edx), %zmm0
-; X86-AVX512-NEXT: movl (%ecx), %ecx
-; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT: vmovups %zmm1, (%esp)
-; X86-AVX512-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT: shll $3, %ecx
-; X86-AVX512-NEXT: andl $56, %ecx
-; X86-AVX512-NEXT: leal {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT: subl %ecx, %edx
-; X86-AVX512-NEXT: vmovups (%edx), %xmm0
-; X86-AVX512-NEXT: vmovups 16(%edx), %xmm1
-; X86-AVX512-NEXT: vmovups 32(%edx), %xmm2
-; X86-AVX512-NEXT: negl %ecx
-; X86-AVX512-NEXT: vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT: addl $140, %esp
-; X86-AVX512-NEXT: vzeroupper
-; X86-AVX512-NEXT: retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -18501,321 +17052,378 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popq %r15
; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retq
;
-; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX: # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leal (,%rdi,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rdi), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r9,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rdi), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rdi), %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rdi), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: leaq (%rdi,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: orq %r9, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT: retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT: retq
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT: retq
;
-; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT: retq
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retq
;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retq
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retq
;
-; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X64-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X64-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X64-NO-BMI2-AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; X64-NO-BMI2-AVX512-NEXT: shll $3, %eax
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT: vpand %xmm3, %xmm0, %xmm4
+; X64-NO-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X64-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-NO-BMI2-AVX512-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: movl %eax, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %esi, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqa64 %zmm1, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm2 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT: shll $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq %rax, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT: vpand %xmm3, %xmm0, %xmm4
+; X64-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X64-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: pushl %esi
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: subl $204, %esp
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NO-SHLD-NO-BMI2-SSE2-NEXT: movl (%ecx), %eax
@@ -20304,668 +18912,733 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: popl %ebp
; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT: retl
;
-; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX: # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 48(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 52(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 56(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT: retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX: # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%ecx), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl $31, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT: retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (,%eax,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT: retl
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT: retl
;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX: # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl (%ecx), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarl $31, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT: retl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1: # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT: retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: pushl %esi
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%ecx), %zmm1
+; X86-NO-BMI2-AVX512-NEXT: movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT: kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT: vmovdqa64 %zmm0, %zmm2
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; X86-NO-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT: vpand %xmm3, %xmm1, %xmm4
+; X86-NO-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X86-NO-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-NO-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: popl %esi
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpermq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqa64 %zmm0, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm1, %zmm2 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT: vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpand %xmm3, %xmm1, %xmm4
+; X86-HAVE-BMI2-AVX512-NEXT: vpsrlq %xmm4, %zmm2, %zmm4
+; X86-HAVE-BMI2-AVX512-NEXT: valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vpandn %xmm3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vporq %zmm4, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%byteOff = load i512, ptr %byteOff.ptr, align 1
%bitOff = shl i512 %byteOff, 3
@@ -21059,39 +19732,69 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X64-SSE42-NEXT: popq %rax
; X64-SSE42-NEXT: retq
;
-; X64-AVX-LABEL: ashr_64bytes_qwordOff:
-; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rax
-; X64-AVX-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX-NEXT: vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT: movq 48(%rdi), %rax
-; X64-AVX-NEXT: movq 56(%rdi), %rcx
-; X64-AVX-NEXT: movl (%rsi), %esi
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: sarq $63, %rcx
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: andl $7, %esi
-; X64-AVX-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
-; X64-AVX-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
-; X64-AVX-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
-; X64-AVX-NEXT: vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT: vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT: vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT: vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT: popq %rax
-; X64-AVX-NEXT: vzeroupper
-; X64-AVX-NEXT: retq
+; X64-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X64-AVX1: # %bb.0:
+; X64-AVX1-NEXT: pushq %rax
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
+; X64-AVX1-NEXT: movq 48(%rdi), %rax
+; X64-AVX1-NEXT: movq 56(%rdi), %rcx
+; X64-AVX1-NEXT: movl (%rsi), %esi
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: sarq $63, %rcx
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT: andl $7, %esi
+; X64-AVX1-NEXT: vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX1-NEXT: vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX1-NEXT: vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX1-NEXT: vmovups -80(%rsp,%rsi,8), %xmm3
+; X64-AVX1-NEXT: vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT: vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT: popq %rax
+; X64-AVX1-NEXT: vzeroupper
+; X64-AVX1-NEXT: retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512: # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT: movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT: movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT: shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X64-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT: vzeroupper
+; X64-NO-BMI2-AVX512-NEXT: retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512: # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT: movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT: movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT: shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT: kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT: retq
;
; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
; X86-SSE2: # %bb.0:
@@ -21289,60 +19992,96 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
; X86-SSE42-NEXT: popl %ebx
; X86-SSE42-NEXT: retl
;
-; X86-AVX-LABEL: ashr_64bytes_qwordOff:
-; X86-AVX: # %bb.0:
-; X86-AVX-NEXT: pushl %ebx
-; X86-AVX-NEXT: pushl %edi
-; X86-AVX-NEXT: pushl %esi
-; X86-AVX-NEXT: subl $128, %esp
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT: vmovups (%edx), %ymm0
-; X86-AVX-NEXT: vmovups 32(%edx), %xmm1
-; X86-AVX-NEXT: movl 48(%edx), %esi
-; X86-AVX-NEXT: movl 52(%edx), %edi
-; X86-AVX-NEXT: movl 56(%edx), %ebx
-; X86-AVX-NEXT: movl 60(%edx), %edx
-; X86-AVX-NEXT: movl (%ecx), %ecx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: vmovups %ymm0, (%esp)
-; X86-AVX-NEXT: sarl $31, %edx
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT: andl $7, %ecx
-; X86-AVX-NEXT: vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX-NEXT: vmovups %xmm3, 48(%eax)
-; X86-AVX-NEXT: vmovups %xmm2, 32(%eax)
-; X86-AVX-NEXT: vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT: vmovups %xmm0, (%eax)
-; X86-AVX-NEXT: addl $128, %esp
-; X86-AVX-NEXT: popl %esi
-; X86-AVX-NEXT: popl %edi
-; X86-AVX-NEXT: popl %ebx
-; X86-AVX-NEXT: vzeroupper
-; X86-AVX-NEXT: retl
+; X86-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X86-AVX1: # %bb.0:
+; X86-AVX1-NEXT: pushl %ebx
+; X86-AVX1-NEXT: pushl %edi
+; X86-AVX1-NEXT: pushl %esi
+; X86-AVX1-NEXT: subl $128, %esp
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT: vmovups (%edx), %ymm0
+; X86-AVX1-NEXT: vmovups 32(%edx), %xmm1
+; X86-AVX1-NEXT: movl 48(%edx), %esi
+; X86-AVX1-NEXT: movl 52(%edx), %edi
+; X86-AVX1-NEXT: movl 56(%edx), %ebx
+; X86-AVX1-NEXT: movl 60(%edx), %edx
+; X86-AVX1-NEXT: movl (%ecx), %ecx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: vmovups %ymm0, (%esp)
+; X86-AVX1-NEXT: sarl $31, %edx
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT: andl $7, %ecx
+; X86-AVX1-NEXT: vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT: vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT: vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT: vmovups 48(%esp,%ecx,8), %xmm3
+; X86-AVX1-NEXT: vmovups %xmm3, 48(%eax)
+; X86-AVX1-NEXT: vmovups %xmm2, 32(%eax)
+; X86-AVX1-NEXT: vmovups %xmm1, 16(%eax)
+; X86-AVX1-NEXT: vmovups %xmm0, (%eax)
+; X86-AVX1-NEXT: addl $128, %esp
+; X86-AVX1-NEXT: popl %esi
+; X86-AVX1-NEXT: popl %edi
+; X86-AVX1-NEXT: popl %ebx
+; X86-AVX1-NEXT: vzeroupper
+; X86-AVX1-NEXT: retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512: # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X86-NO-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT: shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT: kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X86-NO-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%eax)
+; X86-NO-BMI2-AVX512-NEXT: vzeroupper
+; X86-NO-BMI2-AVX512-NEXT: retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512: # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT: movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: vpsraq $63, %zmm0, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT: vpermq %zmm1, %zmm2, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT: movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT: shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT: kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT: vpcompressq %zmm0, %zmm1 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT: vmovdqu64 %zmm1, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT: vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT: retl
%src = load i512, ptr %src.ptr, align 1
%qwordOff = load i512, ptr %qwordOff.ptr, align 1
%bitOff = shl i512 %qwordOff, 6
@@ -21354,4 +20093,14 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; ALL: {{.*}}
; X64: {{.*}}
+; X64-AVX512: {{.*}}
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-NO-BMI2-AVX512: {{.*}}
; X86: {{.*}}
+; X86-AVX512: {{.*}}
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-NO-BMI2-AVX512: {{.*}}
More information about the llvm-commits
mailing list