[llvm] [X86] Expand i512 shifts on AVX512 targets (PR #183198)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 6 02:20:04 PST 2026


https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/183198

>From 19ce540ab7fdbad7bcdee12e83e4606d810926b6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 10 Dec 2025 15:39:11 +0000
Subject: [PATCH] [X86] Expand i512 shifts on AVX512 targets

Cast to vXi64 and use EXPAND/COMPRESS to left/right shift the i64 elements into place, then use FSHL/FSHR to perform the final bit shift.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  110 +
 llvm/test/CodeGen/X86/bit-manip-i512.ll       | 2964 ++++-------
 llvm/test/CodeGen/X86/shift-i512.ll           | 1895 ++-----
 ...lar-shift-by-byte-multiple-legalization.ll | 4399 ++++++-----------
 4 files changed, 3188 insertions(+), 6180 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6bb558f4ef6da..bf1d36bf06a39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1888,6 +1888,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::XOR, MVT::i512, Custom);
     setOperationAction(ISD::ADD, MVT::i512, Custom);
     setOperationAction(ISD::SUB, MVT::i512, Custom);
+    setOperationAction(ISD::SRL, MVT::i512, Custom);
+    setOperationAction(ISD::SHL, MVT::i512, Custom);
+    setOperationAction(ISD::SRA, MVT::i512, Custom);
     setOperationAction(ISD::SELECT, MVT::i512, Custom);
 
     for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
@@ -2936,6 +2939,10 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     // Check for larger than legal scalar integer ops that might have been
     // custom lowered to vector instruction.
     switch (Opcode) {
+    case ISD::SHL:
+    case ISD::SRL:
+    case ISD::SRA:
+      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
@@ -34431,6 +34438,92 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(DAG.getBitcast(VT, Res));
     return;
   }
+  case ISD::SHL:
+  case ISD::SRL:
+  case ISD::SRA: {
+    EVT VT = N->getValueType(0);
+    SDValue Src = N->getOperand(0);
+    SDValue Amt = N->getOperand(1);
+    assert(Subtarget.useAVX512Regs() && "AVX512F required");
+    assert(VT == MVT::i512 && "Unexpected VT!");
+    MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+    MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+
+    if (!mayFoldIntoVector(Src, DAG, Subtarget))
+      return;
+
+    // Early out if this will fold to a constant shift of whole byte elements.
+    // TODO: Directly lower to a shuffle?
+    if (auto *AmtC = dyn_cast<ConstantSDNode>(Amt)) {
+      assert(AmtC->getAPIntValue().ult(512) && "Out of bounds shift amount");
+      if (AmtC->getAPIntValue().urem(8) == 0)
+        return;
+    }
+
+    SDValue AmtLane = DAG.getNode(ISD::SRL, dl, MVT::i32,
+                                  DAG.getZExtOrTrunc(Amt, dl, MVT::i32),
+                                  DAG.getShiftAmountConstant(6, MVT::i32, dl));
+    AmtLane = DAG.getZExtOrTrunc(AmtLane, dl, MVT::i8);
+
+    if (auto *SrcC = dyn_cast<ConstantSDNode>(Src)) {
+      // Special case: SHL(1,Amt) --> SELECT(1<<(Amt/64), SPLAT(1<<(Amt%64)), 0)
+      if (Opc == ISD::SHL && SrcC->getAPIntValue() == 1) {
+        SDValue Bit = DAG.getConstant(1, dl, MVT::i64);
+        SDValue AmtMod = DAG.getNode(ISD::AND, dl, MVT::i64,
+                                     DAG.getZExtOrTrunc(Amt, dl, MVT::i64),
+                                     DAG.getConstant(63, dl, MVT::i64));
+        SDValue LaneMask = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtLane);
+        LaneMask =
+            DAG.getBitcast(BoolVT, DAG.getZExtOrTrunc(LaneMask, dl, MVT::i8));
+        SDValue Elt = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtMod);
+        SDValue Res =
+            DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
+                          DAG.getConstant(0, dl, VecVT));
+        Results.push_back(DAG.getBitcast(VT, Res));
+        return;
+      }
+    }
+
+    // Use EXPAND/COMPRESS to shuffle the i64 elements left/right with the
+    // ShiftAmt/64 'laneshift', and then shuffle one element along to get the
+    // shifted in bits from the neighbouring element. Finally use a funnel shift
+    // with the ShiftAmt%64 'elementshift' to get the final result.
+    SDValue Mask =
+        DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+                    DAG.getNode(ISD::SHL, dl, MVT::i32,
+                                DAG.getAllOnesConstant(dl, MVT::i32), AmtLane));
+    Src = DAG.getBitcast(VecVT, Src);
+
+    SDValue PassThrough;
+    if (Opc == ISD::SRA) {
+      // Splat the MSB sign bit across the vector.
+      PassThrough = DAG.getNode(ISD::SRA, dl, VecVT, Src,
+                                DAG.getShiftAmountConstant(63, VecVT, dl));
+      PassThrough = DAG.getVectorShuffle(VecVT, dl, PassThrough, PassThrough,
+                                         {7, 7, 7, 7, 7, 7, 7, 7});
+    } else {
+      PassThrough = DAG.getConstant(0, dl, VecVT);
+    }
+    SDValue A, B;
+    if (Opc == ISD::SHL) {
+      A = DAG.getNode(X86ISD::EXPAND, dl, VecVT, Src, PassThrough,
+                      DAG.getBitcast(BoolVT, Mask));
+      B = DAG.getVectorShuffle(VecVT, dl, PassThrough, A,
+                               {7, 8, 9, 10, 11, 12, 13, 14});
+    } else {
+      B = DAG.getNode(X86ISD::COMPRESS, dl, VecVT, Src, PassThrough,
+                      DAG.getBitcast(BoolVT, Mask));
+      A = DAG.getVectorShuffle(VecVT, dl, B, PassThrough,
+                               {1, 2, 3, 4, 5, 6, 7, 8});
+    }
+    // Funnel shifts use modulo shift amount so no need to explicitly mask it.
+    SDValue Res =
+        DAG.getNode(Opc == ISD::SHL ? ISD::FSHL : ISD::FSHR, dl, VecVT, A, B,
+                    DAG.getSplatBuildVector(
+                        VecVT, dl, DAG.getZExtOrTrunc(Amt, dl, MVT::i64)));
+    Results.push_back(DAG.getBitcast(VT, Res));
+    return;
+  }
   case ISD::CTPOP: {
     SDValue N0 = N->getOperand(0);
     EVT VT = N->getValueType(0);
@@ -48076,6 +48169,23 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
             dl, DAG, DCI))
       return V;
 
+  // Scalarize single use funnel shift.
+  // Ideally DAG would handle this similar to scalarizeExtractedBinOp.
+  if (InputVector.getOpcode() == ISD::FSHL ||
+      InputVector.getOpcode() == ISD::FSHR) {
+    if (CIdx && InputVector.hasOneUse() &&
+        TLI.isOperationLegal(InputVector.getOpcode(), VT)) {
+      SDValue LHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(0),
+                                            CIdx->getZExtValue());
+      SDValue RHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(1),
+                                            CIdx->getZExtValue());
+      SDValue Amt = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(2),
+                                            CIdx->getZExtValue());
+      Amt = DAG.getShiftAmountOperand(VT, Amt);
+      return DAG.getNode(InputVector.getOpcode(), dl, VT, LHS, RHS, Amt);
+    }
+  }
+
   // Attempt to extract a i1 element by using MOVMSK to extract the signbits
   // and then testing the relevant element.
   //
diff --git a/llvm/test/CodeGen/X86/bit-manip-i512.ll b/llvm/test/CodeGen/X86/bit-manip-i512.ll
index ae0243257451a..bc0ca3d799405 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i512.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i512.ll
@@ -239,330 +239,274 @@ define i512 @bext_i512(i512 %a0, i512 %idx, i512 %len) nounwind {
 ;
 ; AVX512F-LABEL: bext_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r15
 ; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $168, %rsp
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %eax
-; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    negl %eax
-; AVX512F-NEXT:    movslq %eax, %r10
-; AVX512F-NEXT:    movq 144(%rsp,%r10), %r11
-; AVX512F-NEXT:    movq 152(%rsp,%r10), %rax
-; AVX512F-NEXT:    shldq %cl, %r11, %rax
-; AVX512F-NEXT:    movq 136(%rsp,%r10), %rbx
-; AVX512F-NEXT:    shldq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq 128(%rsp,%r10), %r14
-; AVX512F-NEXT:    shldq %cl, %r14, %rbx
-; AVX512F-NEXT:    movq 120(%rsp,%r10), %r15
-; AVX512F-NEXT:    shldq %cl, %r15, %r14
-; AVX512F-NEXT:    movq 112(%rsp,%r10), %r13
-; AVX512F-NEXT:    shldq %cl, %r13, %r15
-; AVX512F-NEXT:    movq 104(%rsp,%r10), %rbp
-; AVX512F-NEXT:    shldq %cl, %rbp, %r13
-; AVX512F-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    movq 96(%rsp,%r10), %rdx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT:    shlxq %rcx, %rdx, %r10
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdx, %rbp
-; AVX512F-NEXT:    movq %r12, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512F-NEXT:    addq $-1, %r10
-; AVX512F-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    adcq $-1, %rbp
-; AVX512F-NEXT:    adcq $-1, %r13
-; AVX512F-NEXT:    adcq $-1, %r15
-; AVX512F-NEXT:    adcq $-1, %r14
-; AVX512F-NEXT:    adcq $-1, %rbx
-; AVX512F-NEXT:    adcq $-1, %r11
-; AVX512F-NEXT:    adcq $-1, %rax
-; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT:    movl $1, %r10d
+; AVX512F-NEXT:    shlxq %rdi, %r10, %r11
+; AVX512F-NEXT:    shrl $6, %edi
+; AVX512F-NEXT:    shlxq %rdi, %r10, %rdi
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vpbroadcastq %r11, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm2, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %edi
+; AVX512F-NEXT:    vptestnmq %zmm2, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %r10d
+; AVX512F-NEXT:    movzbl %r10b, %r10d
+; AVX512F-NEXT:    leal (%r10,%rdi,2), %edi
+; AVX512F-NEXT:    xorl %r10d, %edi
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT:    vmovdqu64 %zmm3, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm1, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512F-NEXT:    vmovq %xmm2, %rbx
 ; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovq %xmm0, %r11
 ; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %r9
+; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovq %xmm1, %r8
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rcx
 ; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %r12d, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %r12d
-; AVX512F-NEXT:    andl $56, %r12d
-; AVX512F-NEXT:    movq -80(%rsp,%r12), %rsi
-; AVX512F-NEXT:    movq -88(%rsp,%r12), %rdx
-; AVX512F-NEXT:    movq %rdx, %r8
-; AVX512F-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512F-NEXT:    movq -72(%rsp,%r12), %r9
-; AVX512F-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512F-NEXT:    movq -64(%rsp,%r12), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -56(%rsp,%r12), %rdi
-; AVX512F-NEXT:    shrdq %cl, %rdi, %r10
-; AVX512F-NEXT:    andq %rbp, %r8
-; AVX512F-NEXT:    andq %r13, %rsi
-; AVX512F-NEXT:    andq %r15, %r9
-; AVX512F-NEXT:    movq -48(%rsp,%r12), %r15
-; AVX512F-NEXT:    shrdq %cl, %r15, %rdi
-; AVX512F-NEXT:    andq %r14, %r10
-; AVX512F-NEXT:    andq %rbx, %rdi
-; AVX512F-NEXT:    movq -96(%rsp,%r12), %rbx
-; AVX512F-NEXT:    movq -40(%rsp,%r12), %r14
-; AVX512F-NEXT:    shrdq %cl, %r14, %r15
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT:    andq %r11, %r15
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX512F-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512F-NEXT:    movq %rbx, (%rax)
-; AVX512F-NEXT:    movq %r8, 8(%rax)
-; AVX512F-NEXT:    movq %rsi, 16(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %r10, 32(%rax)
+; AVX512F-NEXT:    movl %r14d, %edx
+; AVX512F-NEXT:    andl $63, %edx
+; AVX512F-NEXT:    vmovq %rdx, %xmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT:    shrl $3, %r14d
+; AVX512F-NEXT:    andl $56, %r14d
+; AVX512F-NEXT:    vmovdqu64 -128(%rsp,%r14), %zmm2
+; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT:    vpsrlq %xmm1, %zmm2, %zmm2
+; AVX512F-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT:    vpsllq %xmm1, %zmm3, %zmm1
+; AVX512F-NEXT:    vporq %zmm2, %zmm1, %zmm1
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm1, %xmm2
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    vmovq %xmm2, %rsi
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT:    andq %rbx, %rsi
+; AVX512F-NEXT:    andq %r10, %rdi
+; AVX512F-NEXT:    vmovq %xmm2, %r10
+; AVX512F-NEXT:    andq %r11, %r10
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512F-NEXT:    andq %r9, %r11
+; AVX512F-NEXT:    vmovq %xmm1, %r9
+; AVX512F-NEXT:    andq %r8, %r9
+; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512F-NEXT:    vmovq %xmm0, %rbx
+; AVX512F-NEXT:    andq %rcx, %r8
+; AVX512F-NEXT:    vmovq %xmm1, %rcx
+; AVX512F-NEXT:    andq %rbx, %rcx
+; AVX512F-NEXT:    movq %rdx, 56(%rax)
+; AVX512F-NEXT:    movq %rsi, 48(%rax)
 ; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %r15, 48(%rax)
-; AVX512F-NEXT:    movq %rcx, 56(%rax)
-; AVX512F-NEXT:    addq $168, %rsp
+; AVX512F-NEXT:    movq %r10, 32(%rax)
+; AVX512F-NEXT:    movq %r11, 8(%rax)
+; AVX512F-NEXT:    movq %r9, (%rax)
+; AVX512F-NEXT:    movq %r8, 24(%rax)
+; AVX512F-NEXT:    movq %rcx, 16(%rax)
+; AVX512F-NEXT:    addq $8, %rsp
 ; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
 ; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bext_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    subq $152, %rsp
-; AVX512VL-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    negl %eax
-; AVX512VL-NEXT:    movslq %eax, %r10
-; AVX512VL-NEXT:    movq 128(%rsp,%r10), %r11
-; AVX512VL-NEXT:    movq 136(%rsp,%r10), %rax
-; AVX512VL-NEXT:    shldq %cl, %r11, %rax
-; AVX512VL-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT:    movq 120(%rsp,%r10), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq 112(%rsp,%r10), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq 104(%rsp,%r10), %r15
-; AVX512VL-NEXT:    movq 96(%rsp,%r10), %r12
-; AVX512VL-NEXT:    movq 80(%rsp,%r10), %rax
-; AVX512VL-NEXT:    movq 88(%rsp,%r10), %rbp
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    shldq %cl, %r15, %r14
-; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512VL-NEXT:    shldq %cl, %r12, %r15
-; AVX512VL-NEXT:    shldq %cl, %rbp, %r12
+; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    shlxq %rcx, %rax, %r13
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %rax, %rbp
-; AVX512VL-NEXT:    addq $-1, %r13
-; AVX512VL-NEXT:    adcq $-1, %rbp
-; AVX512VL-NEXT:    adcq $-1, %r12
-; AVX512VL-NEXT:    adcq $-1, %r15
-; AVX512VL-NEXT:    adcq $-1, %r14
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl $1, %r11d
+; AVX512VL-NEXT:    shlxq %r10, %r11, %r14
+; AVX512VL-NEXT:    shrl $6, %r10d
+; AVX512VL-NEXT:    shlxq %r10, %r11, %r10
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT:    kmovd %r10d, %k1
+; AVX512VL-NEXT:    vpbroadcastq %r14, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT:    kmovd %k0, %r10d
+; AVX512VL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %r11d
+; AVX512VL-NEXT:    leal (%r11,%r10,2), %r10d
+; AVX512VL-NEXT:    xorl %r11d, %r10d
+; AVX512VL-NEXT:    kmovd %r10d, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm2, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512VL-NEXT:    vmovq %xmm1, %r10
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm2, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vmovdqu %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX512VL-NEXT:    vmovdqu %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovq %xmm1, %r9
 ; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vpextrq $1, %xmm2, %r8
+; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %r10d, %ecx
+; AVX512VL-NEXT:    movl %edi, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %r10d
-; AVX512VL-NEXT:    andl $56, %r10d
-; AVX512VL-NEXT:    movq -96(%rsp,%r10), %rdx
-; AVX512VL-NEXT:    movq -104(%rsp,%r10), %r8
-; AVX512VL-NEXT:    movq %r8, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -88(%rsp,%r10), %rdi
-; AVX512VL-NEXT:    shrdq %cl, %rdi, %rdx
-; AVX512VL-NEXT:    andq %rbp, %rsi
-; AVX512VL-NEXT:    movq -80(%rsp,%r10), %r9
-; AVX512VL-NEXT:    shrdq %cl, %r9, %rdi
-; AVX512VL-NEXT:    andq %r12, %rdx
-; AVX512VL-NEXT:    movq -72(%rsp,%r10), %r12
-; AVX512VL-NEXT:    shrdq %cl, %r12, %r9
-; AVX512VL-NEXT:    andq %r15, %rdi
-; AVX512VL-NEXT:    movq -64(%rsp,%r10), %r15
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r12
-; AVX512VL-NEXT:    andq %r14, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r10), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %r15
-; AVX512VL-NEXT:    andq %rbx, %r12
-; AVX512VL-NEXT:    movq -112(%rsp,%r10), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r8, %r10
-; AVX512VL-NEXT:    andq %r11, %r15
-; AVX512VL-NEXT:    andq %r13, %r10
-; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512VL-NEXT:    movq %r10, (%rax)
-; AVX512VL-NEXT:    movq %rsi, 8(%rax)
-; AVX512VL-NEXT:    movq %rdx, 16(%rax)
-; AVX512VL-NEXT:    movq %rdi, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 32(%rax)
-; AVX512VL-NEXT:    movq %r12, 40(%rax)
-; AVX512VL-NEXT:    movq %r15, 48(%rax)
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT:    shrl $3, %edi
+; AVX512VL-NEXT:    andl $56, %edi
+; AVX512VL-NEXT:    vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT:    vpsrlq %xmm0, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512VL-NEXT:    vpextrq $1, %xmm3, %rdx
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT:    vmovq %xmm2, %rsi
+; AVX512VL-NEXT:    andq %r11, %rcx
+; AVX512VL-NEXT:    vmovq %xmm1, %rdi
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512VL-NEXT:    andq %r10, %rdi
+; AVX512VL-NEXT:    andq %rbx, %r11
+; AVX512VL-NEXT:    vmovq %xmm1, %r10
+; AVX512VL-NEXT:    andq %r9, %r10
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512VL-NEXT:    andq %r8, %r9
+; AVX512VL-NEXT:    vmovq %xmm0, %r8
+; AVX512VL-NEXT:    andq %rsi, %r8
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    vmovq %xmm3, %rbx
+; AVX512VL-NEXT:    andq %rdx, %rsi
+; AVX512VL-NEXT:    vmovq %xmm0, %rdx
+; AVX512VL-NEXT:    andq %rbx, %rdx
 ; AVX512VL-NEXT:    movq %rcx, 56(%rax)
-; AVX512VL-NEXT:    addq $152, %rsp
+; AVX512VL-NEXT:    movq %rdi, 48(%rax)
+; AVX512VL-NEXT:    movq %r11, 40(%rax)
+; AVX512VL-NEXT:    movq %r10, 32(%rax)
+; AVX512VL-NEXT:    movq %r9, 8(%rax)
+; AVX512VL-NEXT:    movq %r8, (%rax)
+; AVX512VL-NEXT:    movq %rsi, 24(%rax)
+; AVX512VL-NEXT:    movq %rdx, 16(%rax)
+; AVX512VL-NEXT:    addq $8, %rsp
 ; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
 ; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
-; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: bext_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rbp
-; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    subq $152, %rsp
-; AVX512VBMI-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    negl %eax
-; AVX512VBMI-NEXT:    movslq %eax, %r10
-; AVX512VBMI-NEXT:    movq 128(%rsp,%r10), %r11
-; AVX512VBMI-NEXT:    movq 136(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %rax
-; AVX512VBMI-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT:    movq 120(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq 112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq 104(%rsp,%r10), %r15
-; AVX512VBMI-NEXT:    movq 96(%rsp,%r10), %r12
-; AVX512VBMI-NEXT:    movq 80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    movq 88(%rsp,%r10), %rbp
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    shldq %cl, %r15, %r14
-; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1
-; AVX512VBMI-NEXT:    shldq %cl, %r12, %r15
-; AVX512VBMI-NEXT:    shldq %cl, %rbp, %r12
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    shlxq %rcx, %rax, %r13
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %rbp
-; AVX512VBMI-NEXT:    addq $-1, %r13
-; AVX512VBMI-NEXT:    adcq $-1, %rbp
-; AVX512VBMI-NEXT:    adcq $-1, %r12
-; AVX512VBMI-NEXT:    adcq $-1, %r15
-; AVX512VBMI-NEXT:    adcq $-1, %r14
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %xmm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl $1, %r11d
+; AVX512VBMI-NEXT:    shlxq %r10, %r11, %r14
+; AVX512VBMI-NEXT:    shrl $6, %r10d
+; AVX512VBMI-NEXT:    shlxq %r10, %r11, %r10
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512VBMI-NEXT:    kmovd %r10d, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %r14, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %r10d
+; AVX512VBMI-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %r11d
+; AVX512VBMI-NEXT:    leal (%r11,%r10,2), %r10d
+; AVX512VBMI-NEXT:    xorl %r11d, %r10d
+; AVX512VBMI-NEXT:    kmovd %r10d, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm2, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r11
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r10
+; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm2, %xmm3
+; AVX512VBMI-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm3, %rbx
+; AVX512VBMI-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm2, %r9
 ; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovq %xmm3, %r8
+; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %r10d, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %r10d
-; AVX512VBMI-NEXT:    andl $56, %r10d
-; AVX512VBMI-NEXT:    movq -96(%rsp,%r10), %rdx
-; AVX512VBMI-NEXT:    movq -104(%rsp,%r10), %r8
-; AVX512VBMI-NEXT:    movq %r8, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -88(%rsp,%r10), %rdi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdi, %rdx
-; AVX512VBMI-NEXT:    andq %rbp, %rsi
-; AVX512VBMI-NEXT:    movq -80(%rsp,%r10), %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdi
-; AVX512VBMI-NEXT:    andq %r12, %rdx
-; AVX512VBMI-NEXT:    movq -72(%rsp,%r10), %r12
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %r9
-; AVX512VBMI-NEXT:    andq %r15, %rdi
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r10), %r15
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r12
-; AVX512VBMI-NEXT:    andq %r14, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r10), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %r15
-; AVX512VBMI-NEXT:    andq %rbx, %r12
-; AVX512VBMI-NEXT:    movq -112(%rsp,%r10), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r8, %r10
-; AVX512VBMI-NEXT:    andq %r11, %r15
-; AVX512VBMI-NEXT:    andq %r13, %r10
-; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512VBMI-NEXT:    movq %r10, (%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 8(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT:    movq %r9, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r12, 40(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT:    shrl $3, %edi
+; AVX512VBMI-NEXT:    andl $56, %edi
+; AVX512VBMI-NEXT:    vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm3, %zmm1
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm2, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm1, %xmm3
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX512VBMI-NEXT:    vmovq %xmm2, %rsi
+; AVX512VBMI-NEXT:    andq %r11, %rcx
+; AVX512VBMI-NEXT:    vmovq %xmm3, %rdi
+; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm1, %xmm2
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm2, %r11
+; AVX512VBMI-NEXT:    andq %r10, %rdi
+; AVX512VBMI-NEXT:    andq %rbx, %r11
+; AVX512VBMI-NEXT:    vmovq %xmm2, %r10
+; AVX512VBMI-NEXT:    andq %r8, %r10
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512VBMI-NEXT:    andq %r9, %r8
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r9
+; AVX512VBMI-NEXT:    andq %rsi, %r9
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rbx
+; AVX512VBMI-NEXT:    andq %rdx, %rsi
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rdx
+; AVX512VBMI-NEXT:    andq %rbx, %rdx
 ; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT:    addq $152, %rsp
+; AVX512VBMI-NEXT:    movq %rdi, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 32(%rax)
+; AVX512VBMI-NEXT:    movq %r8, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    movq %rsi, 24(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rax)
+; AVX512VBMI-NEXT:    addq $8, %rsp
 ; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
 ; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
-; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %bit = shl i512 1, %len
@@ -786,292 +730,117 @@ define i512 @bext_i512_vector(<8 x i64> %v0, i512 %idx, i512 %len) nounwind {
 ;
 ; AVX512F-LABEL: bext_i512_vector:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $152, %rsp
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %eax
-; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    negl %eax
-; AVX512F-NEXT:    cltq
-; AVX512F-NEXT:    movq (%rsp,%rax), %r8
-; AVX512F-NEXT:    movq 8(%rsp,%rax), %rdx
-; AVX512F-NEXT:    shldq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -8(%rsp,%rax), %r9
-; AVX512F-NEXT:    shldq %cl, %r9, %r8
-; AVX512F-NEXT:    movq -16(%rsp,%rax), %r10
-; AVX512F-NEXT:    shldq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -24(%rsp,%rax), %rbx
-; AVX512F-NEXT:    shldq %cl, %rbx, %r10
-; AVX512F-NEXT:    movq -32(%rsp,%rax), %r14
-; AVX512F-NEXT:    shldq %cl, %r14, %rbx
-; AVX512F-NEXT:    movq -48(%rsp,%rax), %r11
-; AVX512F-NEXT:    movq -40(%rsp,%rax), %r15
-; AVX512F-NEXT:    shldq %cl, %r15, %r14
-; AVX512F-NEXT:    shldq %cl, %r11, %r15
-; AVX512F-NEXT:    shlxq %rcx, %r11, %rax
-; AVX512F-NEXT:    addq $-1, %rax
-; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    adcq $-1, %r15
-; AVX512F-NEXT:    adcq $-1, %r14
-; AVX512F-NEXT:    adcq $-1, %rbx
-; AVX512F-NEXT:    adcq $-1, %r10
-; AVX512F-NEXT:    adcq $-1, %r9
-; AVX512F-NEXT:    adcq $-1, %r8
-; AVX512F-NEXT:    adcq $-1, %rdx
-; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %esi, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    movq 32(%rsp,%rsi), %r12
-; AVX512F-NEXT:    movq 24(%rsp,%rsi), %r13
-; AVX512F-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    shrdq %cl, %r12, %r13
-; AVX512F-NEXT:    movq 40(%rsp,%rsi), %rbp
-; AVX512F-NEXT:    shrdq %cl, %rbp, %r12
-; AVX512F-NEXT:    movq 48(%rsp,%rsi), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %rbp
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq 56(%rsp,%rsi), %rdi
-; AVX512F-NEXT:    shrdq %cl, %rdi, %r11
-; AVX512F-NEXT:    andq %r15, %r13
-; AVX512F-NEXT:    andq %r14, %r12
-; AVX512F-NEXT:    andq %rbx, %rbp
-; AVX512F-NEXT:    movq 64(%rsp,%rsi), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %rdi
-; AVX512F-NEXT:    andq %r10, %r11
-; AVX512F-NEXT:    andq %r9, %rdi
-; AVX512F-NEXT:    movq 16(%rsp,%rsi), %r9
-; AVX512F-NEXT:    movq 72(%rsp,%rsi), %rsi
-; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    andq %r8, %rbx
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT:    shrxq %rcx, %rsi, %rcx
-; AVX512F-NEXT:    andq %rdx, %rcx
-; AVX512F-NEXT:    movq %r9, (%rax)
-; AVX512F-NEXT:    movq %r13, 8(%rax)
-; AVX512F-NEXT:    movq %r12, 16(%rax)
-; AVX512F-NEXT:    movq %rbp, 24(%rax)
-; AVX512F-NEXT:    movq %r11, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %rbx, 48(%rax)
-; AVX512F-NEXT:    movq %rcx, 56(%rax)
-; AVX512F-NEXT:    addq $152, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    movl $1, %edx
+; AVX512F-NEXT:    shlxq %rcx, %rdx, %rdi
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    shlxq %rcx, %rdx, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    movl %esi, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm2
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    shrl $6, %esi
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %esi, %ecx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpcmpltuq %zmm1, %zmm3, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 & (zmm0 | zmm4)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bext_i512_vector:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    subq $136, %rsp
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    negl %eax
-; AVX512VL-NEXT:    cltq
-; AVX512VL-NEXT:    movq -16(%rsp,%rax), %r8
-; AVX512VL-NEXT:    movq -8(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    shldq %cl, %r8, %rdx
-; AVX512VL-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT:    shldq %cl, %r9, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT:    shldq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -40(%rsp,%rax), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r10
-; AVX512VL-NEXT:    movq -48(%rsp,%rax), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -64(%rsp,%rax), %r11
-; AVX512VL-NEXT:    movq -56(%rsp,%rax), %r12
-; AVX512VL-NEXT:    shldq %cl, %r12, %r14
-; AVX512VL-NEXT:    shldq %cl, %r11, %r12
-; AVX512VL-NEXT:    shlxq %rcx, %r11, %r11
-; AVX512VL-NEXT:    addq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, %r12
-; AVX512VL-NEXT:    adcq $-1, %r14
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %r10
-; AVX512VL-NEXT:    adcq $-1, %r9
-; AVX512VL-NEXT:    adcq $-1, %r8
-; AVX512VL-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    adcq $-1, %rdx
-; AVX512VL-NEXT:    vmovups %ymm0, (%rsp)
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %esi
-; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    movq 16(%rsp,%rsi), %r15
-; AVX512VL-NEXT:    movq 8(%rsp,%rsi), %rbp
-; AVX512VL-NEXT:    movq %rbp, %r13
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r13
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq 24(%rsp,%rsi), %rdi
-; AVX512VL-NEXT:    shrdq %cl, %rdi, %r15
-; AVX512VL-NEXT:    andq %r12, %r13
-; AVX512VL-NEXT:    movq 32(%rsp,%rsi), %r12
-; AVX512VL-NEXT:    shrdq %cl, %r12, %rdi
-; AVX512VL-NEXT:    andq %r14, %r15
-; AVX512VL-NEXT:    movq 40(%rsp,%rsi), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %r12
-; AVX512VL-NEXT:    andq %rbx, %rdi
-; AVX512VL-NEXT:    movq 48(%rsp,%rsi), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r14
-; AVX512VL-NEXT:    andq %r10, %r12
-; AVX512VL-NEXT:    movq 56(%rsp,%rsi), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT:    andq %r9, %r14
-; AVX512VL-NEXT:    movq (%rsp,%rsi), %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rbp, %rsi
-; AVX512VL-NEXT:    andq %r8, %rbx
-; AVX512VL-NEXT:    andq %r11, %rsi
-; AVX512VL-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512VL-NEXT:    andq %rdx, %rcx
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %r13, 8(%rax)
-; AVX512VL-NEXT:    movq %r15, 16(%rax)
-; AVX512VL-NEXT:    movq %rdi, 24(%rax)
-; AVX512VL-NEXT:    movq %r12, 32(%rax)
-; AVX512VL-NEXT:    movq %r14, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 48(%rax)
-; AVX512VL-NEXT:    movq %rcx, 56(%rax)
-; AVX512VL-NEXT:    addq $136, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
-; AVX512VL-NEXT:    popq %rbp
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT:    movl $1, %edx
+; AVX512VL-NEXT:    shlxq %rcx, %rdx, %rdi
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    shlxq %rcx, %rdx, %rcx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %edx
+; AVX512VL-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %esi, %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpand %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 & (zmm0 | zmm4)
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: bext_i512_vector:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rbp
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    subq $136, %rsp
-; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    negl %eax
-; AVX512VBMI-NEXT:    cltq
-; AVX512VBMI-NEXT:    movq -16(%rsp,%rax), %r8
-; AVX512VBMI-NEXT:    movq -8(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    shldq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r9, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -40(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r10
-; AVX512VBMI-NEXT:    movq -48(%rsp,%rax), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -64(%rsp,%rax), %r11
-; AVX512VBMI-NEXT:    movq -56(%rsp,%rax), %r12
-; AVX512VBMI-NEXT:    shldq %cl, %r12, %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r12
-; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r11
-; AVX512VBMI-NEXT:    addq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, %r12
-; AVX512VBMI-NEXT:    adcq $-1, %r14
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %r10
-; AVX512VBMI-NEXT:    adcq $-1, %r9
-; AVX512VBMI-NEXT:    adcq $-1, %r8
-; AVX512VBMI-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vextractf64x4 $1, %zmm0, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    adcq $-1, %rdx
-; AVX512VBMI-NEXT:    vmovups %ymm0, (%rsp)
-; AVX512VBMI-NEXT:    movl %esi, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    movq 16(%rsp,%rsi), %r15
-; AVX512VBMI-NEXT:    movq 8(%rsp,%rsi), %rbp
-; AVX512VBMI-NEXT:    movq %rbp, %r13
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r13
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq 24(%rsp,%rsi), %rdi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    andq %r12, %r13
-; AVX512VBMI-NEXT:    movq 32(%rsp,%rsi), %r12
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %rdi
-; AVX512VBMI-NEXT:    andq %r14, %r15
-; AVX512VBMI-NEXT:    movq 40(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %r12
-; AVX512VBMI-NEXT:    andq %rbx, %rdi
-; AVX512VBMI-NEXT:    movq 48(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r14
-; AVX512VBMI-NEXT:    andq %r10, %r12
-; AVX512VBMI-NEXT:    movq 56(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT:    andq %r9, %r14
-; AVX512VBMI-NEXT:    movq (%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rbp, %rsi
-; AVX512VBMI-NEXT:    andq %r8, %rbx
-; AVX512VBMI-NEXT:    andq %r11, %rsi
-; AVX512VBMI-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512VBMI-NEXT:    andq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq %rsi, (%rax)
-; AVX512VBMI-NEXT:    movq %r13, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT:    movq %r12, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r14, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT:    addq $136, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
-; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT:    movl $1, %edx
+; AVX512VBMI-NEXT:    shlxq %rcx, %rdx, %rdi
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    shlxq %rcx, %rdx, %rcx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %ecx
+; AVX512VBMI-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT:    xorl %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT:    vpbroadcastq %rsi, %zmm1
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm3, %zmm0
+; AVX512VBMI-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = bitcast <8 x i64> %v0 to i512
@@ -1302,297 +1071,120 @@ define i512 @bext_i512_load(ptr %p0, i512 %idx, i512 %len) nounwind {
 ;
 ; AVX512F-LABEL: bext_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $152, %rsp
-; AVX512F-NEXT:    vmovups (%rsi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm2 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movl %eax, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %eax
-; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    negl %eax
-; AVX512F-NEXT:    cltq
-; AVX512F-NEXT:    movq (%rsp,%rax), %r8
-; AVX512F-NEXT:    movq 8(%rsp,%rax), %rsi
-; AVX512F-NEXT:    shldq %cl, %r8, %rsi
-; AVX512F-NEXT:    movq -8(%rsp,%rax), %r9
-; AVX512F-NEXT:    shldq %cl, %r9, %r8
-; AVX512F-NEXT:    movq -16(%rsp,%rax), %r10
-; AVX512F-NEXT:    shldq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -24(%rsp,%rax), %rbx
-; AVX512F-NEXT:    shldq %cl, %rbx, %r10
-; AVX512F-NEXT:    movq -32(%rsp,%rax), %r14
-; AVX512F-NEXT:    shldq %cl, %r14, %rbx
-; AVX512F-NEXT:    movq -48(%rsp,%rax), %r11
-; AVX512F-NEXT:    movq -40(%rsp,%rax), %r15
-; AVX512F-NEXT:    shldq %cl, %r15, %r14
-; AVX512F-NEXT:    shldq %cl, %r11, %r15
-; AVX512F-NEXT:    shlxq %rcx, %r11, %rax
-; AVX512F-NEXT:    addq $-1, %rax
-; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    adcq $-1, %r15
-; AVX512F-NEXT:    adcq $-1, %r14
-; AVX512F-NEXT:    adcq $-1, %rbx
-; AVX512F-NEXT:    adcq $-1, %r10
-; AVX512F-NEXT:    adcq $-1, %r9
-; AVX512F-NEXT:    adcq $-1, %r8
-; AVX512F-NEXT:    adcq $-1, %rsi
-; AVX512F-NEXT:    vmovups %zmm1, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %edx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq 32(%rsp,%rdx), %r12
-; AVX512F-NEXT:    movq 24(%rsp,%rdx), %r13
-; AVX512F-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    shrdq %cl, %r12, %r13
-; AVX512F-NEXT:    movq 40(%rsp,%rdx), %rbp
-; AVX512F-NEXT:    shrdq %cl, %rbp, %r12
-; AVX512F-NEXT:    movq 48(%rsp,%rdx), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %rbp
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq 56(%rsp,%rdx), %rdi
-; AVX512F-NEXT:    shrdq %cl, %rdi, %r11
-; AVX512F-NEXT:    andq %r15, %r13
-; AVX512F-NEXT:    andq %r14, %r12
-; AVX512F-NEXT:    andq %rbx, %rbp
-; AVX512F-NEXT:    movq 64(%rsp,%rdx), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %rdi
-; AVX512F-NEXT:    andq %r10, %r11
-; AVX512F-NEXT:    andq %r9, %rdi
-; AVX512F-NEXT:    movq 16(%rsp,%rdx), %r9
-; AVX512F-NEXT:    movq 72(%rsp,%rdx), %rdx
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    andq %r8, %rbx
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT:    shrxq %rcx, %rdx, %rcx
-; AVX512F-NEXT:    andq %rsi, %rcx
-; AVX512F-NEXT:    movq %r9, (%rax)
-; AVX512F-NEXT:    movq %r13, 8(%rax)
-; AVX512F-NEXT:    movq %r12, 16(%rax)
-; AVX512F-NEXT:    movq %rbp, 24(%rax)
-; AVX512F-NEXT:    movq %r11, 32(%rax)
-; AVX512F-NEXT:    movq %rdi, 40(%rax)
-; AVX512F-NEXT:    movq %rbx, 48(%rax)
-; AVX512F-NEXT:    movq %rcx, 56(%rax)
-; AVX512F-NEXT:    addq $152, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    movl $1, %esi
+; AVX512F-NEXT:    shlxq %rcx, %rsi, %rdi
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    shlxq %rcx, %rsi, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    movl %edx, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm2
+; AVX512F-NEXT:    vpbroadcastq %xmm2, %xmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = -1
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpcmpltuq %zmm1, %zmm3, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm3 {%k1}
+; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 & (zmm0 | zmm4)
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bext_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    subq $136, %rsp
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm3 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    negl %eax
-; AVX512VL-NEXT:    cltq
-; AVX512VL-NEXT:    movq -16(%rsp,%rax), %r8
-; AVX512VL-NEXT:    movq -8(%rsp,%rax), %rsi
-; AVX512VL-NEXT:    shldq %cl, %r8, %rsi
-; AVX512VL-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT:    shldq %cl, %r9, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT:    shldq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -40(%rsp,%rax), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r10
-; AVX512VL-NEXT:    movq -48(%rsp,%rax), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -64(%rsp,%rax), %r11
-; AVX512VL-NEXT:    movq -56(%rsp,%rax), %r12
-; AVX512VL-NEXT:    shldq %cl, %r12, %r14
-; AVX512VL-NEXT:    shldq %cl, %r11, %r12
-; AVX512VL-NEXT:    shlxq %rcx, %r11, %r11
-; AVX512VL-NEXT:    addq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, %r12
-; AVX512VL-NEXT:    adcq $-1, %r14
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %r10
-; AVX512VL-NEXT:    adcq $-1, %r9
-; AVX512VL-NEXT:    adcq $-1, %r8
-; AVX512VL-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    adcq $-1, %rsi
-; AVX512VL-NEXT:    vmovups %ymm0, (%rsp)
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq 16(%rsp,%rdx), %r15
-; AVX512VL-NEXT:    movq 8(%rsp,%rdx), %rbp
-; AVX512VL-NEXT:    movq %rbp, %r13
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r13
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq 24(%rsp,%rdx), %rdi
-; AVX512VL-NEXT:    shrdq %cl, %rdi, %r15
-; AVX512VL-NEXT:    andq %r12, %r13
-; AVX512VL-NEXT:    movq 32(%rsp,%rdx), %r12
-; AVX512VL-NEXT:    shrdq %cl, %r12, %rdi
-; AVX512VL-NEXT:    andq %r14, %r15
-; AVX512VL-NEXT:    movq 40(%rsp,%rdx), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %r12
-; AVX512VL-NEXT:    andq %rbx, %rdi
-; AVX512VL-NEXT:    movq 48(%rsp,%rdx), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r14
-; AVX512VL-NEXT:    andq %r10, %r12
-; AVX512VL-NEXT:    movq 56(%rsp,%rdx), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT:    andq %r9, %r14
-; AVX512VL-NEXT:    movq (%rsp,%rdx), %rdx
-; AVX512VL-NEXT:    shrdq %cl, %rbp, %rdx
-; AVX512VL-NEXT:    andq %r8, %rbx
-; AVX512VL-NEXT:    andq %r11, %rdx
-; AVX512VL-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512VL-NEXT:    andq %rsi, %rcx
-; AVX512VL-NEXT:    movq %rdx, (%rax)
-; AVX512VL-NEXT:    movq %r13, 8(%rax)
-; AVX512VL-NEXT:    movq %r15, 16(%rax)
-; AVX512VL-NEXT:    movq %rdi, 24(%rax)
-; AVX512VL-NEXT:    movq %r12, 32(%rax)
-; AVX512VL-NEXT:    movq %r14, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 48(%rax)
-; AVX512VL-NEXT:    movq %rcx, 56(%rax)
-; AVX512VL-NEXT:    addq $136, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
-; AVX512VL-NEXT:    popq %rbp
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    movl $1, %esi
+; AVX512VL-NEXT:    shlxq %rcx, %rsi, %rdi
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    shlxq %rcx, %rsi, %rcx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %esi
+; AVX512VL-NEXT:    leal (%rsi,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %esi, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpand %xmm1, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm0, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm3, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 & (zmm0 | zmm4)
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: bext_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rbp
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    subq $136, %rsp
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm3 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    negl %eax
-; AVX512VBMI-NEXT:    cltq
-; AVX512VBMI-NEXT:    movq -16(%rsp,%rax), %r8
-; AVX512VBMI-NEXT:    movq -8(%rsp,%rax), %rsi
-; AVX512VBMI-NEXT:    shldq %cl, %r8, %rsi
-; AVX512VBMI-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r9, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -40(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r10
-; AVX512VBMI-NEXT:    movq -48(%rsp,%rax), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -64(%rsp,%rax), %r11
-; AVX512VBMI-NEXT:    movq -56(%rsp,%rax), %r12
-; AVX512VBMI-NEXT:    shldq %cl, %r12, %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r12
-; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r11
-; AVX512VBMI-NEXT:    addq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, %r12
-; AVX512VBMI-NEXT:    adcq $-1, %r14
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %r10
-; AVX512VBMI-NEXT:    adcq $-1, %r9
-; AVX512VBMI-NEXT:    adcq $-1, %r8
-; AVX512VBMI-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    adcq $-1, %rsi
-; AVX512VBMI-NEXT:    vmovups %ymm0, (%rsp)
-; AVX512VBMI-NEXT:    movl %edx, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq 16(%rsp,%rdx), %r15
-; AVX512VBMI-NEXT:    movq 8(%rsp,%rdx), %rbp
-; AVX512VBMI-NEXT:    movq %rbp, %r13
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r13
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq 24(%rsp,%rdx), %rdi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    andq %r12, %r13
-; AVX512VBMI-NEXT:    movq 32(%rsp,%rdx), %r12
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %rdi
-; AVX512VBMI-NEXT:    andq %r14, %r15
-; AVX512VBMI-NEXT:    movq 40(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %r12
-; AVX512VBMI-NEXT:    andq %rbx, %rdi
-; AVX512VBMI-NEXT:    movq 48(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r14
-; AVX512VBMI-NEXT:    andq %r10, %r12
-; AVX512VBMI-NEXT:    movq 56(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT:    andq %r9, %r14
-; AVX512VBMI-NEXT:    movq (%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbp, %rdx
-; AVX512VBMI-NEXT:    andq %r8, %rbx
-; AVX512VBMI-NEXT:    andq %r11, %rdx
-; AVX512VBMI-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512VBMI-NEXT:    andq %rsi, %rcx
-; AVX512VBMI-NEXT:    movq %rdx, (%rax)
-; AVX512VBMI-NEXT:    movq %r13, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 24(%rax)
-; AVX512VBMI-NEXT:    movq %r12, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r14, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT:    addq $136, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
-; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    movl $1, %esi
+; AVX512VBMI-NEXT:    shlxq %rcx, %rsi, %rdi
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    shlxq %rcx, %rsi, %rcx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %ecx
+; AVX512VBMI-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %esi
+; AVX512VBMI-NEXT:    leal (%rsi,%rcx,2), %ecx
+; AVX512VBMI-NEXT:    xorl %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm3, %zmm0
+; AVX512VBMI-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3924,220 +3516,181 @@ define i512 @bzhi_i512(i512 %a0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: bzhi_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
 ; AVX512F-NEXT:    pushq %r15
 ; AVX512F-NEXT:    pushq %r14
 ; AVX512F-NEXT:    pushq %r13
 ; AVX512F-NEXT:    pushq %r12
 ; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, %rax
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movl %r10d, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %r10d
-; AVX512F-NEXT:    andl $56, %r10d
-; AVX512F-NEXT:    negl %r10d
-; AVX512F-NEXT:    movslq %r10d, %r13
-; AVX512F-NEXT:    movq -16(%rsp,%r13), %r11
-; AVX512F-NEXT:    movq -8(%rsp,%r13), %r10
-; AVX512F-NEXT:    shldq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -24(%rsp,%r13), %rbx
-; AVX512F-NEXT:    shldq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -32(%rsp,%r13), %r14
-; AVX512F-NEXT:    shldq %cl, %r14, %rbx
-; AVX512F-NEXT:    movq -40(%rsp,%r13), %r15
-; AVX512F-NEXT:    shldq %cl, %r15, %r14
-; AVX512F-NEXT:    movq -48(%rsp,%r13), %r12
-; AVX512F-NEXT:    shldq %cl, %r12, %r15
-; AVX512F-NEXT:    movq -64(%rsp,%r13), %rbp
-; AVX512F-NEXT:    movq -56(%rsp,%r13), %r13
-; AVX512F-NEXT:    shldq %cl, %r13, %r12
-; AVX512F-NEXT:    shldq %cl, %rbp, %r13
-; AVX512F-NEXT:    shlxq %rcx, %rbp, %rcx
-; AVX512F-NEXT:    addq $-1, %rcx
-; AVX512F-NEXT:    adcq $-1, %r13
-; AVX512F-NEXT:    adcq $-1, %r12
-; AVX512F-NEXT:    adcq $-1, %r15
-; AVX512F-NEXT:    adcq $-1, %r14
-; AVX512F-NEXT:    adcq $-1, %rbx
-; AVX512F-NEXT:    adcq $-1, %r11
-; AVX512F-NEXT:    adcq $-1, %r10
-; AVX512F-NEXT:    andq %r9, %r14
-; AVX512F-NEXT:    andq %r8, %r15
-; AVX512F-NEXT:    andq %rax, %r12
-; AVX512F-NEXT:    andq %rdx, %r13
-; AVX512F-NEXT:    andq %rsi, %rcx
-; AVX512F-NEXT:    movq %rcx, (%rdi)
-; AVX512F-NEXT:    movq %r13, 8(%rdi)
-; AVX512F-NEXT:    movq %r12, 16(%rdi)
-; AVX512F-NEXT:    movq %r15, 24(%rdi)
-; AVX512F-NEXT:    andq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    movq %r14, 32(%rdi)
-; AVX512F-NEXT:    movq %rbx, 40(%rdi)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movl $1, %r10d
+; AVX512F-NEXT:    shlxq %rax, %r10, %r11
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    shlxq %rax, %r10, %rax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpbroadcastq %r11, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vpcmpltuq %zmm1, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %r10d
+; AVX512F-NEXT:    movzbl %r10b, %r10d
+; AVX512F-NEXT:    leal (%r10,%rax,2), %eax
+; AVX512F-NEXT:    xorl %r10d, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %r10
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdi
+; AVX512F-NEXT:    vmovq %xmm1, %r11
+; AVX512F-NEXT:    vmovq %xmm2, %rbx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r14
+; AVX512F-NEXT:    vmovq %xmm0, %r15
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX512F-NEXT:    vmovq %xmm0, %r13
+; AVX512F-NEXT:    andq %rcx, %r13
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    andq %rsi, %r15
+; AVX512F-NEXT:    andq %rdx, %r14
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    andq {{[0-9]+}}(%rsp), %rdi
 ; AVX512F-NEXT:    andq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq %r11, 48(%rdi)
 ; AVX512F-NEXT:    andq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movq %r10, 56(%rdi)
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    movq %r10, 56(%rax)
+; AVX512F-NEXT:    movq %r11, 48(%rax)
+; AVX512F-NEXT:    movq %rdi, 40(%rax)
+; AVX512F-NEXT:    movq %rbx, 32(%rax)
+; AVX512F-NEXT:    movq %r14, 8(%rax)
+; AVX512F-NEXT:    movq %r15, (%rax)
+; AVX512F-NEXT:    movq %r12, 24(%rax)
+; AVX512F-NEXT:    movq %r13, 16(%rax)
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r12
 ; AVX512F-NEXT:    popq %r13
 ; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bzhi_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
 ; AVX512VL-NEXT:    pushq %r15
 ; AVX512VL-NEXT:    pushq %r14
 ; AVX512VL-NEXT:    pushq %r13
 ; AVX512VL-NEXT:    pushq %r12
 ; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    movq %rcx, %rax
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movl %r10d, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %r10d
-; AVX512VL-NEXT:    andl $56, %r10d
-; AVX512VL-NEXT:    negl %r10d
-; AVX512VL-NEXT:    movslq %r10d, %rbp
-; AVX512VL-NEXT:    movq -16(%rsp,%rbp), %r11
-; AVX512VL-NEXT:    movq -8(%rsp,%rbp), %r10
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -24(%rsp,%rbp), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -32(%rsp,%rbp), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -40(%rsp,%rbp), %r15
-; AVX512VL-NEXT:    shldq %cl, %r15, %r14
-; AVX512VL-NEXT:    movq -48(%rsp,%rbp), %r12
-; AVX512VL-NEXT:    shldq %cl, %r12, %r15
-; AVX512VL-NEXT:    movq -56(%rsp,%rbp), %r13
-; AVX512VL-NEXT:    shldq %cl, %r13, %r12
-; AVX512VL-NEXT:    movq -64(%rsp,%rbp), %rbp
-; AVX512VL-NEXT:    shldq %cl, %rbp, %r13
-; AVX512VL-NEXT:    shlxq %rcx, %rbp, %rcx
-; AVX512VL-NEXT:    addq $-1, %rcx
-; AVX512VL-NEXT:    adcq $-1, %r13
-; AVX512VL-NEXT:    adcq $-1, %r12
-; AVX512VL-NEXT:    adcq $-1, %r15
-; AVX512VL-NEXT:    adcq $-1, %r14
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, %r10
-; AVX512VL-NEXT:    andq %r9, %r14
-; AVX512VL-NEXT:    andq %r8, %r15
-; AVX512VL-NEXT:    andq %rax, %r12
-; AVX512VL-NEXT:    andq %rdx, %r13
-; AVX512VL-NEXT:    andq %rsi, %rcx
-; AVX512VL-NEXT:    movq %rcx, (%rdi)
-; AVX512VL-NEXT:    movq %r13, 8(%rdi)
-; AVX512VL-NEXT:    movq %r12, 16(%rdi)
-; AVX512VL-NEXT:    movq %r15, 24(%rdi)
-; AVX512VL-NEXT:    movq %r14, 32(%rdi)
-; AVX512VL-NEXT:    andq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT:    movq %rbx, 40(%rdi)
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    movl $1, %r10d
+; AVX512VL-NEXT:    shlxq %rdi, %r10, %r11
+; AVX512VL-NEXT:    shrl $6, %edi
+; AVX512VL-NEXT:    shlxq %rdi, %r10, %rdi
+; AVX512VL-NEXT:    kmovd %edi, %k1
+; AVX512VL-NEXT:    vpbroadcastq %r11, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %edi
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %r10d
+; AVX512VL-NEXT:    leal (%r10,%rdi,2), %edi
+; AVX512VL-NEXT:    xorl %r10d, %edi
+; AVX512VL-NEXT:    kmovd %edi, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512VL-NEXT:    vmovq %xmm0, %r10
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm1, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512VL-NEXT:    vmovq %xmm0, %rbx
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512VL-NEXT:    vmovq %xmm1, %r15
+; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX512VL-NEXT:    vmovq %xmm0, %r13
+; AVX512VL-NEXT:    andq %rcx, %r13
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %rsi, %r15
+; AVX512VL-NEXT:    andq %rdx, %r14
+; AVX512VL-NEXT:    andq %r9, %rbx
 ; AVX512VL-NEXT:    andq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT:    movq %r11, 48(%rdi)
 ; AVX512VL-NEXT:    andq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq %r10, 56(%rdi)
-; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    andq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    movq %rdi, 56(%rax)
+; AVX512VL-NEXT:    movq %r10, 48(%rax)
+; AVX512VL-NEXT:    movq %r11, 40(%rax)
+; AVX512VL-NEXT:    movq %rbx, 32(%rax)
+; AVX512VL-NEXT:    movq %r14, 8(%rax)
+; AVX512VL-NEXT:    movq %r15, (%rax)
+; AVX512VL-NEXT:    movq %r12, 24(%rax)
+; AVX512VL-NEXT:    movq %r13, 16(%rax)
 ; AVX512VL-NEXT:    popq %rbx
 ; AVX512VL-NEXT:    popq %r12
 ; AVX512VL-NEXT:    popq %r13
 ; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    popq %r15
-; AVX512VL-NEXT:    popq %rbp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: bzhi_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rbp
 ; AVX512VBMI-NEXT:    pushq %r15
 ; AVX512VBMI-NEXT:    pushq %r14
 ; AVX512VBMI-NEXT:    pushq %r13
 ; AVX512VBMI-NEXT:    pushq %r12
 ; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    movq %rcx, %rax
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movl %r10d, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %r10d
-; AVX512VBMI-NEXT:    andl $56, %r10d
-; AVX512VBMI-NEXT:    negl %r10d
-; AVX512VBMI-NEXT:    movslq %r10d, %rbp
-; AVX512VBMI-NEXT:    movq -16(%rsp,%rbp), %r11
-; AVX512VBMI-NEXT:    movq -8(%rsp,%rbp), %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -24(%rsp,%rbp), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -32(%rsp,%rbp), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -40(%rsp,%rbp), %r15
-; AVX512VBMI-NEXT:    shldq %cl, %r15, %r14
-; AVX512VBMI-NEXT:    movq -48(%rsp,%rbp), %r12
-; AVX512VBMI-NEXT:    shldq %cl, %r12, %r15
-; AVX512VBMI-NEXT:    movq -56(%rsp,%rbp), %r13
-; AVX512VBMI-NEXT:    shldq %cl, %r13, %r12
-; AVX512VBMI-NEXT:    movq -64(%rsp,%rbp), %rbp
-; AVX512VBMI-NEXT:    shldq %cl, %rbp, %r13
-; AVX512VBMI-NEXT:    shlxq %rcx, %rbp, %rcx
-; AVX512VBMI-NEXT:    addq $-1, %rcx
-; AVX512VBMI-NEXT:    adcq $-1, %r13
-; AVX512VBMI-NEXT:    adcq $-1, %r12
-; AVX512VBMI-NEXT:    adcq $-1, %r15
-; AVX512VBMI-NEXT:    adcq $-1, %r14
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, %r10
-; AVX512VBMI-NEXT:    andq %r9, %r14
-; AVX512VBMI-NEXT:    andq %r8, %r15
-; AVX512VBMI-NEXT:    andq %rax, %r12
-; AVX512VBMI-NEXT:    andq %rdx, %r13
-; AVX512VBMI-NEXT:    andq %rsi, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, (%rdi)
-; AVX512VBMI-NEXT:    movq %r13, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r12, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r15, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %r14, 32(%rdi)
-; AVX512VBMI-NEXT:    andq {{[0-9]+}}(%rsp), %rbx
-; AVX512VBMI-NEXT:    movq %rbx, 40(%rdi)
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    movl $1, %r10d
+; AVX512VBMI-NEXT:    shlxq %rdi, %r10, %r11
+; AVX512VBMI-NEXT:    shrl $6, %edi
+; AVX512VBMI-NEXT:    shlxq %rdi, %r10, %rdi
+; AVX512VBMI-NEXT:    kmovd %edi, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %r11, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %edi
+; AVX512VBMI-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %r10d
+; AVX512VBMI-NEXT:    leal (%r10,%rdi,2), %edi
+; AVX512VBMI-NEXT:    xorl %r10d, %edi
+; AVX512VBMI-NEXT:    kmovd %edi, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm1, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512VBMI-NEXT:    vmovq %xmm0, %r10
+; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm1, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %r11
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rbx
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r15
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %r12
+; AVX512VBMI-NEXT:    vmovq %xmm0, %r13
+; AVX512VBMI-NEXT:    andq %rcx, %r13
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %rsi, %r15
+; AVX512VBMI-NEXT:    andq %rdx, %r14
+; AVX512VBMI-NEXT:    andq %r9, %rbx
 ; AVX512VBMI-NEXT:    andq {{[0-9]+}}(%rsp), %r11
-; AVX512VBMI-NEXT:    movq %r11, 48(%rdi)
 ; AVX512VBMI-NEXT:    andq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    andq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT:    movq %r14, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r15, (%rax)
+; AVX512VBMI-NEXT:    movq %r12, 24(%rax)
+; AVX512VBMI-NEXT:    movq %r13, 16(%rax)
 ; AVX512VBMI-NEXT:    popq %rbx
 ; AVX512VBMI-NEXT:    popq %r12
 ; AVX512VBMI-NEXT:    popq %r13
 ; AVX512VBMI-NEXT:    popq %r14
 ; AVX512VBMI-NEXT:    popq %r15
-; AVX512VBMI-NEXT:    popq %rbp
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %bit = shl i512 1, %idx
@@ -4379,223 +3932,73 @@ define i512 @bzhi_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: bzhi_i512_vector:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm1 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %esi, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    negl %esi
-; AVX512F-NEXT:    movslq %esi, %r14
-; AVX512F-NEXT:    movq -16(%rsp,%r14), %rdx
-; AVX512F-NEXT:    movq -8(%rsp,%r14), %rsi
-; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -24(%rsp,%r14), %r8
-; AVX512F-NEXT:    shldq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -32(%rsp,%r14), %r9
-; AVX512F-NEXT:    shldq %cl, %r9, %r8
-; AVX512F-NEXT:    movq -40(%rsp,%r14), %r10
-; AVX512F-NEXT:    shldq %cl, %r10, %r9
-; AVX512F-NEXT:    vpextrq $1, %xmm0, %rbx
-; AVX512F-NEXT:    movq -48(%rsp,%r14), %r11
-; AVX512F-NEXT:    shldq %cl, %r11, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512F-NEXT:    movq -64(%rsp,%r14), %r15
-; AVX512F-NEXT:    movq -56(%rsp,%r14), %rdi
-; AVX512F-NEXT:    shldq %cl, %rdi, %r11
-; AVX512F-NEXT:    shldq %cl, %r15, %rdi
-; AVX512F-NEXT:    shlxq %rcx, %r15, %rcx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT:    addq $-1, %rcx
-; AVX512F-NEXT:    adcq $-1, %rdi
-; AVX512F-NEXT:    adcq $-1, %r11
-; AVX512F-NEXT:    adcq $-1, %r10
-; AVX512F-NEXT:    adcq $-1, %r9
-; AVX512F-NEXT:    adcq $-1, %r8
-; AVX512F-NEXT:    adcq $-1, %rdx
-; AVX512F-NEXT:    adcq $-1, %rsi
-; AVX512F-NEXT:    andq %r14, %rsi
-; AVX512F-NEXT:    vmovq %xmm1, %r14
-; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT:    andq %r14, %rdx
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT:    andq %r14, %r8
-; AVX512F-NEXT:    vmovq %xmm1, %r14
-; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512F-NEXT:    andq %r14, %r9
-; AVX512F-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512F-NEXT:    andq %r14, %r10
-; AVX512F-NEXT:    vmovq %xmm1, %r14
-; AVX512F-NEXT:    andq %r14, %r11
-; AVX512F-NEXT:    andq %rbx, %rdi
-; AVX512F-NEXT:    vmovq %xmm0, %rbx
-; AVX512F-NEXT:    andq %rbx, %rcx
-; AVX512F-NEXT:    movq %rcx, (%rax)
-; AVX512F-NEXT:    movq %rdi, 8(%rax)
-; AVX512F-NEXT:    movq %r11, 16(%rax)
-; AVX512F-NEXT:    movq %r10, 24(%rax)
-; AVX512F-NEXT:    movq %r9, 32(%rax)
-; AVX512F-NEXT:    movq %r8, 40(%rax)
-; AVX512F-NEXT:    movq %rdx, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 56(%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    movl $1, %ecx
+; AVX512F-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT:    shrl $6, %esi
+; AVX512F-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bzhi_i512_vector:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0]
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %esi
-; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    negl %esi
-; AVX512VL-NEXT:    movslq %esi, %r14
-; AVX512VL-NEXT:    movq -16(%rsp,%r14), %rsi
-; AVX512VL-NEXT:    movq -8(%rsp,%r14), %r8
-; AVX512VL-NEXT:    shldq %cl, %rsi, %r8
-; AVX512VL-NEXT:    movq -24(%rsp,%r14), %r9
-; AVX512VL-NEXT:    shldq %cl, %r9, %rsi
-; AVX512VL-NEXT:    movq -32(%rsp,%r14), %r10
-; AVX512VL-NEXT:    shldq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -40(%rsp,%r14), %r11
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -48(%rsp,%r14), %rdi
-; AVX512VL-NEXT:    shldq %cl, %rdi, %r11
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VL-NEXT:    movq -56(%rsp,%r14), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %rdi
-; AVX512VL-NEXT:    movq -64(%rsp,%r14), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    shlxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT:    addq $-1, %rcx
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %rdi
-; AVX512VL-NEXT:    adcq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, %r10
-; AVX512VL-NEXT:    adcq $-1, %r9
-; AVX512VL-NEXT:    adcq $-1, %rsi
-; AVX512VL-NEXT:    adcq $-1, %r8
-; AVX512VL-NEXT:    andq %r14, %r8
-; AVX512VL-NEXT:    vmovq %xmm1, %r14
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT:    andq %r14, %rsi
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT:    andq %r14, %r9
-; AVX512VL-NEXT:    vmovq %xmm1, %r14
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    andq %r14, %r10
-; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT:    andq %r14, %r11
-; AVX512VL-NEXT:    vmovq %xmm1, %r14
-; AVX512VL-NEXT:    andq %r14, %rdi
-; AVX512VL-NEXT:    andq %rdx, %rbx
-; AVX512VL-NEXT:    vmovq %xmm0, %rdx
-; AVX512VL-NEXT:    andq %rdx, %rcx
-; AVX512VL-NEXT:    movq %rcx, (%rax)
-; AVX512VL-NEXT:    movq %rbx, 8(%rax)
-; AVX512VL-NEXT:    movq %rdi, 16(%rax)
-; AVX512VL-NEXT:    movq %r11, 24(%rax)
-; AVX512VL-NEXT:    movq %r10, 32(%rax)
-; AVX512VL-NEXT:    movq %r9, 40(%rax)
-; AVX512VL-NEXT:    movq %rsi, 48(%rax)
-; AVX512VL-NEXT:    movq %r8, 56(%rax)
-; AVX512VL-NEXT:    addq $8, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    vzeroupper
-; AVX512VL-NEXT:    retq
-;
-; AVX512VBMI-LABEL: bzhi_i512_vector:
-; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdx
-; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm1 = [1,0]
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %esi, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    negl %esi
-; AVX512VBMI-NEXT:    movslq %esi, %r14
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r14), %rsi
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r14), %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rsi, %r8
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r14), %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r9, %rsi
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r14), %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r14), %r11
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r14), %rdi
-; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r11
-; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r14), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %rdi
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r14), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    shlxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT:    addq $-1, %rcx
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %rdi
-; AVX512VBMI-NEXT:    adcq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, %r10
-; AVX512VBMI-NEXT:    adcq $-1, %r9
-; AVX512VBMI-NEXT:    adcq $-1, %rsi
-; AVX512VBMI-NEXT:    adcq $-1, %r8
-; AVX512VBMI-NEXT:    andq %r14, %r8
-; AVX512VBMI-NEXT:    vmovq %xmm1, %r14
-; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT:    andq %r14, %rsi
-; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT:    andq %r14, %r9
-; AVX512VBMI-NEXT:    vmovq %xmm1, %r14
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT:    andq %r14, %r10
-; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT:    andq %r14, %r11
-; AVX512VBMI-NEXT:    vmovq %xmm1, %r14
-; AVX512VBMI-NEXT:    andq %r14, %rdi
-; AVX512VBMI-NEXT:    andq %rdx, %rbx
-; AVX512VBMI-NEXT:    vmovq %xmm0, %rdx
-; AVX512VBMI-NEXT:    andq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, (%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %r11, 24(%rax)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r9, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 48(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 56(%rax)
-; AVX512VBMI-NEXT:    addq $8, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
+; AVX512VL-NEXT:    movl $1, %ecx
+; AVX512VL-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VL-NEXT:    kmovb %k0, %edx
+; AVX512VL-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VL-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bzhi_i512_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movl $1, %ecx
+; AVX512VBMI-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT:    shrl $6, %esi
+; AVX512VBMI-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm1, %zmm2, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %ecx
+; AVX512VBMI-NEXT:    vptestnmq %zmm1, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT:    xorl %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT:    vpandq %zmm2, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = bitcast <8 x i64> %v0 to i512
@@ -4740,190 +4143,73 @@ define i512 @bzhi_i512_load(ptr %p0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: bzhi_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %edx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    negl %edx
-; AVX512F-NEXT:    movslq %edx, %r14
-; AVX512F-NEXT:    movq -16(%rsp,%r14), %rdx
-; AVX512F-NEXT:    movq -8(%rsp,%r14), %r8
-; AVX512F-NEXT:    shldq %cl, %rdx, %r8
-; AVX512F-NEXT:    movq -24(%rsp,%r14), %r9
-; AVX512F-NEXT:    shldq %cl, %r9, %rdx
-; AVX512F-NEXT:    movq -32(%rsp,%r14), %r10
-; AVX512F-NEXT:    shldq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -40(%rsp,%r14), %r11
-; AVX512F-NEXT:    shldq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -48(%rsp,%r14), %rbx
-; AVX512F-NEXT:    shldq %cl, %rbx, %r11
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r14), %r15
-; AVX512F-NEXT:    movq -56(%rsp,%r14), %rdi
-; AVX512F-NEXT:    shldq %cl, %rdi, %rbx
-; AVX512F-NEXT:    shldq %cl, %r15, %rdi
-; AVX512F-NEXT:    shlxq %rcx, %r15, %rcx
-; AVX512F-NEXT:    addq $-1, %rcx
-; AVX512F-NEXT:    adcq $-1, %rdi
-; AVX512F-NEXT:    adcq $-1, %rbx
-; AVX512F-NEXT:    adcq $-1, %r11
-; AVX512F-NEXT:    adcq $-1, %r10
-; AVX512F-NEXT:    adcq $-1, %r9
-; AVX512F-NEXT:    adcq $-1, %rdx
-; AVX512F-NEXT:    adcq $-1, %r8
-; AVX512F-NEXT:    andq 56(%rsi), %r8
-; AVX512F-NEXT:    andq 48(%rsi), %rdx
-; AVX512F-NEXT:    andq 40(%rsi), %r9
-; AVX512F-NEXT:    andq 32(%rsi), %r10
-; AVX512F-NEXT:    andq 24(%rsi), %r11
-; AVX512F-NEXT:    andq 16(%rsi), %rbx
-; AVX512F-NEXT:    andq 8(%rsi), %rdi
-; AVX512F-NEXT:    andq (%rsi), %rcx
-; AVX512F-NEXT:    movq %rcx, (%rax)
-; AVX512F-NEXT:    movq %rdi, 8(%rax)
-; AVX512F-NEXT:    movq %rbx, 16(%rax)
-; AVX512F-NEXT:    movq %r11, 24(%rax)
-; AVX512F-NEXT:    movq %r10, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 40(%rax)
-; AVX512F-NEXT:    movq %rdx, 48(%rax)
-; AVX512F-NEXT:    movq %r8, 56(%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    movl $1, %ecx
+; AVX512F-NEXT:    shlxq %rdx, %rcx, %rdi
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    shlxq %rdx, %rcx, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpandq (%rsi), %zmm1, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: bzhi_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    negl %edx
-; AVX512VL-NEXT:    movslq %edx, %rax
-; AVX512VL-NEXT:    movq -16(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    movq -8(%rsp,%rax), %r8
-; AVX512VL-NEXT:    shldq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VL-NEXT:    shldq %cl, %r9, %rdx
-; AVX512VL-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VL-NEXT:    shldq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -40(%rsp,%rax), %r11
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -48(%rsp,%rax), %rbx
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -56(%rsp,%rax), %r14
-; AVX512VL-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -64(%rsp,%rax), %r15
-; AVX512VL-NEXT:    shldq %cl, %r15, %r14
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    shlxq %rcx, %r15, %rcx
-; AVX512VL-NEXT:    addq $-1, %rcx
-; AVX512VL-NEXT:    adcq $-1, %r14
-; AVX512VL-NEXT:    adcq $-1, %rbx
-; AVX512VL-NEXT:    adcq $-1, %r11
-; AVX512VL-NEXT:    adcq $-1, %r10
-; AVX512VL-NEXT:    adcq $-1, %r9
-; AVX512VL-NEXT:    adcq $-1, %rdx
-; AVX512VL-NEXT:    adcq $-1, %r8
-; AVX512VL-NEXT:    andq 56(%rsi), %r8
-; AVX512VL-NEXT:    andq 48(%rsi), %rdx
-; AVX512VL-NEXT:    andq 40(%rsi), %r9
-; AVX512VL-NEXT:    andq 32(%rsi), %r10
-; AVX512VL-NEXT:    andq 24(%rsi), %r11
-; AVX512VL-NEXT:    andq 16(%rsi), %rbx
-; AVX512VL-NEXT:    andq 8(%rsi), %r14
-; AVX512VL-NEXT:    andq (%rsi), %rcx
-; AVX512VL-NEXT:    movq %rcx, (%rdi)
-; AVX512VL-NEXT:    movq %r14, 8(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 16(%rdi)
-; AVX512VL-NEXT:    movq %r11, 24(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r9, 40(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r8, 56(%rdi)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    movl $1, %ecx
+; AVX512VL-NEXT:    shlxq %rdx, %rcx, %rdi
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    shlxq %rdx, %rcx, %rcx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %edx
+; AVX512VL-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpandq (%rsi), %zmm1, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: bzhi_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %edx, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    negl %edx
-; AVX512VBMI-NEXT:    movslq %edx, %rax
-; AVX512VBMI-NEXT:    movq -16(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    movq -8(%rsp,%rax), %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq -24(%rsp,%rax), %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r9, %rdx
-; AVX512VBMI-NEXT:    movq -32(%rsp,%rax), %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -40(%rsp,%rax), %r11
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -48(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -56(%rsp,%rax), %r14
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -64(%rsp,%rax), %r15
-; AVX512VBMI-NEXT:    shldq %cl, %r15, %r14
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    shlxq %rcx, %r15, %rcx
-; AVX512VBMI-NEXT:    addq $-1, %rcx
-; AVX512VBMI-NEXT:    adcq $-1, %r14
-; AVX512VBMI-NEXT:    adcq $-1, %rbx
-; AVX512VBMI-NEXT:    adcq $-1, %r11
-; AVX512VBMI-NEXT:    adcq $-1, %r10
-; AVX512VBMI-NEXT:    adcq $-1, %r9
-; AVX512VBMI-NEXT:    adcq $-1, %rdx
-; AVX512VBMI-NEXT:    adcq $-1, %r8
-; AVX512VBMI-NEXT:    andq 56(%rsi), %r8
-; AVX512VBMI-NEXT:    andq 48(%rsi), %rdx
-; AVX512VBMI-NEXT:    andq 40(%rsi), %r9
-; AVX512VBMI-NEXT:    andq 32(%rsi), %r10
-; AVX512VBMI-NEXT:    andq 24(%rsi), %r11
-; AVX512VBMI-NEXT:    andq 16(%rsi), %rbx
-; AVX512VBMI-NEXT:    andq 8(%rsi), %r14
-; AVX512VBMI-NEXT:    andq (%rsi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, (%rdi)
-; AVX512VBMI-NEXT:    movq %r14, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 56(%rdi)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    movl $1, %ecx
+; AVX512VBMI-NEXT:    shlxq %rdx, %rcx, %rdi
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    shlxq %rdx, %rcx, %rcx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %ecx
+; AVX512VBMI-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT:    xorl %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpandq (%rsi), %zmm1, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -5183,120 +4469,81 @@ define i512 @isolate_msb_i512(i512 %a0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: isolate_msb_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    subq $24, %rsp
-; AVX512F-NEXT:    movq %r8, %r12
-; AVX512F-NEXT:    movq %rcx, %r8
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vmovq %r12, %xmm1
+; AVX512F-NEXT:    orq %rax, %rdx
+; AVX512F-NEXT:    vmovq %r8, %xmm1
+; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r8
+; AVX512F-NEXT:    orq %rdx, %r8
 ; AVX512F-NEXT:    vmovq %rsi, %xmm2
+; AVX512F-NEXT:    orq %r9, %rsi
 ; AVX512F-NEXT:    vmovq %rcx, %xmm3
+; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    orq %rsi, %rcx
+; AVX512F-NEXT:    xorl %edx, %edx
+; AVX512F-NEXT:    orq %r8, %rcx
+; AVX512F-NEXT:    sete %dl
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512F-NEXT:    vmovq %r9, %xmm2
-; AVX512F-NEXT:    vmovq %r10, %xmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vmovq %r9, %xmm1
+; AVX512F-NEXT:    vmovq %rax, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
 ; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
 ; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovd %xmm0, %r13d
-; AVX512F-NEXT:    movl %r13d, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %r13d
-; AVX512F-NEXT:    andl $56, %r13d
-; AVX512F-NEXT:    movq -56(%rsp,%r13), %rax
-; AVX512F-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    movq -64(%rsp,%r13), %r14
-; AVX512F-NEXT:    movq %r14, %r11
-; AVX512F-NEXT:    shrdq %cl, %rax, %r11
-; AVX512F-NEXT:    movq -72(%rsp,%r13), %r15
-; AVX512F-NEXT:    movq %r15, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512F-NEXT:    movq -80(%rsp,%r13), %rbp
-; AVX512F-NEXT:    movq %rbp, %r14
-; AVX512F-NEXT:    shrdq %cl, %r15, %r14
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r12
-; AVX512F-NEXT:    movq -88(%rsp,%r13), %rax
-; AVX512F-NEXT:    movq %rax, %r15
-; AVX512F-NEXT:    shrdq %cl, %rbp, %r15
-; AVX512F-NEXT:    orq %r10, %rdx
-; AVX512F-NEXT:    orq %rdx, %r12
-; AVX512F-NEXT:    orq %r9, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%r13), %rbp
-; AVX512F-NEXT:    movq %rbp, %rdx
-; AVX512F-NEXT:    shrdq %cl, %rax, %rdx
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r8
-; AVX512F-NEXT:    movq -112(%rsp,%r13), %r9
-; AVX512F-NEXT:    movq -104(%rsp,%r13), %rax
-; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    shrdq %cl, %rbp, %r10
-; AVX512F-NEXT:    shrdq %cl, %rax, %r9
-; AVX512F-NEXT:    orq %rsi, %r8
-; AVX512F-NEXT:    xorl %eax, %eax
-; AVX512F-NEXT:    orq %r12, %r8
-; AVX512F-NEXT:    shrxq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX512F-NEXT:    cmoveq %rax, %r10
-; AVX512F-NEXT:    cmoveq %rax, %rdx
-; AVX512F-NEXT:    cmoveq %rax, %r15
-; AVX512F-NEXT:    cmoveq %rax, %r14
-; AVX512F-NEXT:    cmoveq %rax, %rbx
-; AVX512F-NEXT:    cmoveq %rax, %r11
-; AVX512F-NEXT:    cmoveq %rax, %r9
-; AVX512F-NEXT:    cmoveq %rax, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %r11, 48(%rdi)
-; AVX512F-NEXT:    movq %rbx, 40(%rdi)
-; AVX512F-NEXT:    movq %r14, 32(%rdi)
-; AVX512F-NEXT:    movq %r15, 24(%rdi)
-; AVX512F-NEXT:    movq %rdx, 16(%rdi)
-; AVX512F-NEXT:    movq %r10, 8(%rdi)
-; AVX512F-NEXT:    movq %r9, (%rdi)
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    movl %eax, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %eax, %ecx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    addq $24, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    negl %edx
+; AVX512F-NEXT:    kmovw %edx, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: isolate_msb_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq %rcx, %rax
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    vmovq %r8, %xmm0
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT:    vmovq %rdx, %xmm0
+; AVX512VL-NEXT:    vmovq %r8, %xmm1
 ; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r8
-; AVX512VL-NEXT:    vmovq %rdx, %xmm1
+; AVX512VL-NEXT:    orq %rax, %rdx
+; AVX512VL-NEXT:    orq %rdx, %r8
 ; AVX512VL-NEXT:    vmovq %rsi, %xmm2
 ; AVX512VL-NEXT:    vmovq %rcx, %xmm3
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %rax
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT:    orq %r9, %rsi
+; AVX512VL-NEXT:    orq %rsi, %rcx
+; AVX512VL-NEXT:    xorl %edx, %edx
+; AVX512VL-NEXT:    orq %r8, %rcx
+; AVX512VL-NEXT:    sete %dl
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
 ; AVX512VL-NEXT:    vmovq %r9, %xmm2
-; AVX512VL-NEXT:    vmovq %r10, %xmm3
+; AVX512VL-NEXT:    vmovq %rax, %xmm3
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
@@ -5304,159 +4551,78 @@ define i512 @isolate_msb_i512(i512 %a0, i512 %idx) nounwind {
 ; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovd %xmm0, %r14d
-; AVX512VL-NEXT:    movl %r14d, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %r14d
-; AVX512VL-NEXT:    andl $56, %r14d
-; AVX512VL-NEXT:    movq -72(%rsp,%r14), %r15
-; AVX512VL-NEXT:    movq -80(%rsp,%r14), %r12
-; AVX512VL-NEXT:    movq %r12, %r11
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r11
-; AVX512VL-NEXT:    movq -88(%rsp,%r14), %r13
-; AVX512VL-NEXT:    movq %r13, %rbx
-; AVX512VL-NEXT:    shrdq %cl, %r12, %rbx
-; AVX512VL-NEXT:    orq %r10, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%r14), %r12
-; AVX512VL-NEXT:    movq %r12, %r10
-; AVX512VL-NEXT:    shrdq %cl, %r13, %r10
-; AVX512VL-NEXT:    orq %rdx, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%r14), %r13
-; AVX512VL-NEXT:    movq %r13, %rdx
-; AVX512VL-NEXT:    shrdq %cl, %r12, %rdx
-; AVX512VL-NEXT:    orq %r9, %rsi
-; AVX512VL-NEXT:    movq -112(%rsp,%r14), %r12
-; AVX512VL-NEXT:    movq %r12, %r9
-; AVX512VL-NEXT:    shrdq %cl, %r13, %r9
-; AVX512VL-NEXT:    orq %rsi, %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%r14), %r13
-; AVX512VL-NEXT:    movq %r13, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %r12, %rsi
-; AVX512VL-NEXT:    movq -128(%rsp,%r14), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r13, %r14
-; AVX512VL-NEXT:    xorl %r12d, %r12d
-; AVX512VL-NEXT:    orq %r8, %rax
-; AVX512VL-NEXT:    shrxq %rcx, %r15, %rax
-; AVX512VL-NEXT:    cmoveq %r12, %rsi
-; AVX512VL-NEXT:    cmoveq %r12, %r9
-; AVX512VL-NEXT:    cmoveq %r12, %rdx
-; AVX512VL-NEXT:    cmoveq %r12, %r10
-; AVX512VL-NEXT:    cmoveq %r12, %rbx
-; AVX512VL-NEXT:    cmoveq %r12, %r11
-; AVX512VL-NEXT:    cmoveq %r12, %r14
-; AVX512VL-NEXT:    cmoveq %r12, %rax
-; AVX512VL-NEXT:    movq %rax, 56(%rdi)
-; AVX512VL-NEXT:    movq %r11, 48(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 24(%rdi)
-; AVX512VL-NEXT:    movq %r9, 16(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VL-NEXT:    movq %r14, (%rdi)
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    negl %edx
+; AVX512VL-NEXT:    kmovd %edx, %k0
+; AVX512VL-NEXT:    knotw %k0, %k1
+; AVX512VL-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: isolate_msb_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    movq %rcx, %rax
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    vmovq %r8, %xmm0
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    vmovq %rdx, %xmm0
+; AVX512VBMI-NEXT:    orq %rdi, %rdx
+; AVX512VBMI-NEXT:    vmovq %r8, %xmm1
 ; AVX512VBMI-NEXT:    orq {{[0-9]+}}(%rsp), %r8
-; AVX512VBMI-NEXT:    vmovq %rdx, %xmm1
+; AVX512VBMI-NEXT:    orq %rdx, %r8
 ; AVX512VBMI-NEXT:    vmovq %rsi, %xmm2
+; AVX512VBMI-NEXT:    orq %r9, %rsi
 ; AVX512VBMI-NEXT:    vmovq %rcx, %xmm3
-; AVX512VBMI-NEXT:    orq {{[0-9]+}}(%rsp), %rax
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VBMI-NEXT:    vmovq %r9, %xmm2
-; AVX512VBMI-NEXT:    vmovq %r10, %xmm3
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512VBMI-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT:    orq %rsi, %rcx
+; AVX512VBMI-NEXT:    xorl %edx, %edx
+; AVX512VBMI-NEXT:    orq %r8, %rcx
+; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
+; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VBMI-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VBMI-NEXT:    vmovq %r9, %xmm1
+; AVX512VBMI-NEXT:    vmovq %rdi, %xmm3
+; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512VBMI-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
 ; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VBMI-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
 ; AVX512VBMI-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VBMI-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovd %xmm0, %r14d
-; AVX512VBMI-NEXT:    movl %r14d, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %r14d
-; AVX512VBMI-NEXT:    andl $56, %r14d
-; AVX512VBMI-NEXT:    movq -72(%rsp,%r14), %r15
-; AVX512VBMI-NEXT:    movq -80(%rsp,%r14), %r12
-; AVX512VBMI-NEXT:    movq %r12, %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r11
-; AVX512VBMI-NEXT:    movq -88(%rsp,%r14), %r13
-; AVX512VBMI-NEXT:    movq %r13, %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %rbx
-; AVX512VBMI-NEXT:    orq %r10, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%r14), %r12
-; AVX512VBMI-NEXT:    movq %r12, %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r13, %r10
-; AVX512VBMI-NEXT:    orq %rdx, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%r14), %r13
-; AVX512VBMI-NEXT:    movq %r13, %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %rdx
-; AVX512VBMI-NEXT:    orq %r9, %rsi
-; AVX512VBMI-NEXT:    movq -112(%rsp,%r14), %r12
-; AVX512VBMI-NEXT:    movq %r12, %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r13, %r9
-; AVX512VBMI-NEXT:    orq %rsi, %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%r14), %r13
-; AVX512VBMI-NEXT:    movq %r13, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %r12, %rsi
-; AVX512VBMI-NEXT:    movq -128(%rsp,%r14), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r13, %r14
-; AVX512VBMI-NEXT:    xorl %r12d, %r12d
-; AVX512VBMI-NEXT:    orq %r8, %rax
-; AVX512VBMI-NEXT:    shrxq %rcx, %r15, %rax
-; AVX512VBMI-NEXT:    cmoveq %r12, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r12, %r9
-; AVX512VBMI-NEXT:    cmoveq %r12, %rdx
-; AVX512VBMI-NEXT:    cmoveq %r12, %r10
-; AVX512VBMI-NEXT:    cmoveq %r12, %rbx
-; AVX512VBMI-NEXT:    cmoveq %r12, %r11
-; AVX512VBMI-NEXT:    cmoveq %r12, %r14
-; AVX512VBMI-NEXT:    cmoveq %r12, %rax
-; AVX512VBMI-NEXT:    movq %rax, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r14, (%rdi)
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    sete %dl
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rcx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    negl %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k0
+; AVX512VBMI-NEXT:    knotw %k0, %k1
+; AVX512VBMI-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %eqz = icmp eq i512 %a0, 0
@@ -5828,208 +4994,109 @@ define i512 @isolate_msb_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: isolate_msb_i512_vector:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vmovaps {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm1
-; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT:    vmovd %xmm1, %ebx
-; AVX512F-NEXT:    movl %ebx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %ebx
-; AVX512F-NEXT:    andl $56, %ebx
-; AVX512F-NEXT:    movq -72(%rsp,%rbx), %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rbx), %rax
-; AVX512F-NEXT:    movq %rax, %rdx
-; AVX512F-NEXT:    shrdq %cl, %r10, %rdx
-; AVX512F-NEXT:    movq -88(%rsp,%rbx), %r9
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rbx), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %r9, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rbx), %r14
-; AVX512F-NEXT:    movq %r14, %r9
-; AVX512F-NEXT:    shrdq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -112(%rsp,%rbx), %r15
-; AVX512F-NEXT:    movq %r15, %r11
-; AVX512F-NEXT:    shrdq %cl, %r14, %r11
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -128(%rsp,%rbx), %rdi
-; AVX512F-NEXT:    movq -120(%rsp,%rbx), %r14
-; AVX512F-NEXT:    movq %r14, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r14, %rdi
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    xorl %r14d, %r14d
+; AVX512F-NEXT:    xorl %ecx, %ecx
 ; AVX512F-NEXT:    kortestw %k0, %k0
-; AVX512F-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512F-NEXT:    cmoveq %r14, %rbx
-; AVX512F-NEXT:    cmoveq %r14, %r11
-; AVX512F-NEXT:    cmoveq %r14, %r9
-; AVX512F-NEXT:    cmoveq %r14, %r8
-; AVX512F-NEXT:    cmoveq %r14, %rsi
-; AVX512F-NEXT:    cmoveq %r14, %rdx
-; AVX512F-NEXT:    cmoveq %r14, %rdi
-; AVX512F-NEXT:    cmoveq %r14, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rax)
-; AVX512F-NEXT:    movq %rdx, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 40(%rax)
-; AVX512F-NEXT:    movq %r8, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %r11, 16(%rax)
-; AVX512F-NEXT:    movq %rbx, 8(%rax)
-; AVX512F-NEXT:    movq %rdi, (%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    sete %cl
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    movl %eax, %edx
+; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %eax, %edx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    negl %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: isolate_msb_i512_vector:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq %zmm0, %zmm2, %zmm2
-; AVX512VL-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm2
-; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm2 {%k1}
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovd %xmm2, %r10d
-; AVX512VL-NEXT:    movl %r10d, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %r10d
-; AVX512VL-NEXT:    andl $56, %r10d
-; AVX512VL-NEXT:    movq -72(%rsp,%r10), %r11
-; AVX512VL-NEXT:    movq -80(%rsp,%r10), %rax
-; AVX512VL-NEXT:    movq %rax, %rdx
-; AVX512VL-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq -88(%rsp,%r10), %r9
-; AVX512VL-NEXT:    movq %r9, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%r10), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %r9, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%r10), %rbx
-; AVX512VL-NEXT:    movq %rbx, %r9
-; AVX512VL-NEXT:    shrdq %cl, %rax, %r9
-; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -112(%rsp,%r10), %r14
-; AVX512VL-NEXT:    movq %r14, %rdi
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %rdi
 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512VL-NEXT:    movq -120(%rsp,%r10), %r15
-; AVX512VL-NEXT:    movq %r15, %rbx
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%r10), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r10
-; AVX512VL-NEXT:    xorl %r14d, %r14d
+; AVX512VL-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-NEXT:    kortestw %k0, %k0
-; AVX512VL-NEXT:    shrxq %rcx, %r11, %rcx
-; AVX512VL-NEXT:    cmoveq %r14, %rbx
-; AVX512VL-NEXT:    cmoveq %r14, %rdi
-; AVX512VL-NEXT:    cmoveq %r14, %r9
-; AVX512VL-NEXT:    cmoveq %r14, %r8
-; AVX512VL-NEXT:    cmoveq %r14, %rsi
-; AVX512VL-NEXT:    cmoveq %r14, %rdx
-; AVX512VL-NEXT:    cmoveq %r14, %r10
-; AVX512VL-NEXT:    cmoveq %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rax)
-; AVX512VL-NEXT:    movq %rdx, 48(%rax)
-; AVX512VL-NEXT:    movq %rsi, 40(%rax)
-; AVX512VL-NEXT:    movq %r8, 32(%rax)
-; AVX512VL-NEXT:    movq %r9, 24(%rax)
-; AVX512VL-NEXT:    movq %rdi, 16(%rax)
-; AVX512VL-NEXT:    movq %rbx, 8(%rax)
-; AVX512VL-NEXT:    movq %r10, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    sete %cl
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    movl %eax, %edx
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    negl %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k0
+; AVX512VL-NEXT:    knotw %k0, %k1
+; AVX512VL-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: isolate_msb_i512_vector:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VBMI-NEXT:    vpermq %zmm0, %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512VBMI-NEXT:    vplzcntq %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vpcompressq %zmm2, %zmm2 {%k1}
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovd %xmm2, %r10d
-; AVX512VBMI-NEXT:    movl %r10d, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %r10d
-; AVX512VBMI-NEXT:    andl $56, %r10d
-; AVX512VBMI-NEXT:    movq -72(%rsp,%r10), %r11
-; AVX512VBMI-NEXT:    movq -80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    movq %rax, %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq -88(%rsp,%r10), %r9
-; AVX512VBMI-NEXT:    movq %r9, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT:    movq %rbx, %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %r9
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT:    movq %r14, %rdi
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %rdi
 ; AVX512VBMI-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512VBMI-NEXT:    movq -120(%rsp,%r10), %r15
-; AVX512VBMI-NEXT:    movq %r15, %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%r10), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r10
-; AVX512VBMI-NEXT:    xorl %r14d, %r14d
+; AVX512VBMI-NEXT:    xorl %ecx, %ecx
 ; AVX512VBMI-NEXT:    kortestw %k0, %k0
-; AVX512VBMI-NEXT:    shrxq %rcx, %r11, %rcx
-; AVX512VBMI-NEXT:    cmoveq %r14, %rbx
-; AVX512VBMI-NEXT:    cmoveq %r14, %rdi
-; AVX512VBMI-NEXT:    cmoveq %r14, %r9
-; AVX512VBMI-NEXT:    cmoveq %r14, %r8
-; AVX512VBMI-NEXT:    cmoveq %r14, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r14, %rdx
-; AVX512VBMI-NEXT:    cmoveq %r14, %r10
-; AVX512VBMI-NEXT:    cmoveq %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r10, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VBMI-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VBMI-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    sete %cl
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rdx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    negl %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k0
+; AVX512VBMI-NEXT:    knotw %k0, %k1
+; AVX512VBMI-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = bitcast <8 x i64> %v0 to i512
@@ -6300,211 +5367,112 @@ define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
 ;
 ; AVX512F-LABEL: isolate_msb_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
 ; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vmovaps {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm1
-; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm1, %ebx
-; AVX512F-NEXT:    movl %ebx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %ebx
-; AVX512F-NEXT:    andl $56, %ebx
-; AVX512F-NEXT:    movq -72(%rsp,%rbx), %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rbx), %rax
-; AVX512F-NEXT:    movq %rax, %rdx
-; AVX512F-NEXT:    shrdq %cl, %r10, %rdx
-; AVX512F-NEXT:    movq -88(%rsp,%rbx), %r9
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rbx), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %r9, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rbx), %r14
-; AVX512F-NEXT:    movq %r14, %r9
-; AVX512F-NEXT:    shrdq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -112(%rsp,%rbx), %r15
-; AVX512F-NEXT:    movq %r15, %r11
-; AVX512F-NEXT:    shrdq %cl, %r14, %r11
-; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -128(%rsp,%rbx), %rdi
-; AVX512F-NEXT:    movq -120(%rsp,%rbx), %r14
-; AVX512F-NEXT:    movq %r14, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r14, %rdi
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512F-NEXT:    xorl %r14d, %r14d
+; AVX512F-NEXT:    xorl %ecx, %ecx
 ; AVX512F-NEXT:    kortestw %k0, %k0
-; AVX512F-NEXT:    shrxq %rcx, %r10, %rcx
-; AVX512F-NEXT:    cmoveq %r14, %rbx
-; AVX512F-NEXT:    cmoveq %r14, %r11
-; AVX512F-NEXT:    cmoveq %r14, %r9
-; AVX512F-NEXT:    cmoveq %r14, %r8
-; AVX512F-NEXT:    cmoveq %r14, %rsi
-; AVX512F-NEXT:    cmoveq %r14, %rdx
-; AVX512F-NEXT:    cmoveq %r14, %rdi
-; AVX512F-NEXT:    cmoveq %r14, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rax)
-; AVX512F-NEXT:    movq %rdx, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 40(%rax)
-; AVX512F-NEXT:    movq %r8, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %r11, 16(%rax)
-; AVX512F-NEXT:    movq %rbx, 8(%rax)
-; AVX512F-NEXT:    movq %rdi, (%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    sete %cl
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512F-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    movl %eax, %edx
+; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %eax, %edx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512F-NEXT:    negl %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: isolate_msb_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq %zmm0, %zmm2, %zmm2
-; AVX512VL-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm2
-; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm2 {%k1} {z}
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovd %xmm2, %r10d
-; AVX512VL-NEXT:    movl %r10d, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %r10d
-; AVX512VL-NEXT:    andl $56, %r10d
-; AVX512VL-NEXT:    movq -72(%rsp,%r10), %r11
-; AVX512VL-NEXT:    movq -80(%rsp,%r10), %rax
-; AVX512VL-NEXT:    movq %rax, %rdx
-; AVX512VL-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq -88(%rsp,%r10), %r9
-; AVX512VL-NEXT:    movq %r9, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%r10), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %r9, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%r10), %rbx
-; AVX512VL-NEXT:    movq %rbx, %r9
-; AVX512VL-NEXT:    shrdq %cl, %rax, %r9
-; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -112(%rsp,%r10), %r14
-; AVX512VL-NEXT:    movq %r14, %rdi
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %rdi
 ; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512VL-NEXT:    movq -120(%rsp,%r10), %r15
-; AVX512VL-NEXT:    movq %r15, %rbx
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%r10), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r15, %r10
-; AVX512VL-NEXT:    xorl %r14d, %r14d
+; AVX512VL-NEXT:    xorl %ecx, %ecx
 ; AVX512VL-NEXT:    kortestw %k0, %k0
-; AVX512VL-NEXT:    shrxq %rcx, %r11, %rcx
-; AVX512VL-NEXT:    cmoveq %r14, %rbx
-; AVX512VL-NEXT:    cmoveq %r14, %rdi
-; AVX512VL-NEXT:    cmoveq %r14, %r9
-; AVX512VL-NEXT:    cmoveq %r14, %r8
-; AVX512VL-NEXT:    cmoveq %r14, %rsi
-; AVX512VL-NEXT:    cmoveq %r14, %rdx
-; AVX512VL-NEXT:    cmoveq %r14, %r10
-; AVX512VL-NEXT:    cmoveq %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rax)
-; AVX512VL-NEXT:    movq %rdx, 48(%rax)
-; AVX512VL-NEXT:    movq %rsi, 40(%rax)
-; AVX512VL-NEXT:    movq %r8, 32(%rax)
-; AVX512VL-NEXT:    movq %r9, 24(%rax)
-; AVX512VL-NEXT:    movq %rdi, 16(%rax)
-; AVX512VL-NEXT:    movq %rbx, 8(%rax)
-; AVX512VL-NEXT:    movq %r10, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    sete %cl
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
+; AVX512VL-NEXT:    movl %eax, %edx
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    negl %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k0
+; AVX512VL-NEXT:    knotw %k0, %k1
+; AVX512VL-NEXT:    vpord %zmm2, %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: isolate_msb_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
-; AVX512VBMI-NEXT:    vpermq %zmm0, %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vptestmq %zmm2, %zmm2, %k1
-; AVX512VBMI-NEXT:    vplzcntq %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
-; AVX512VBMI-NEXT:    vpcompressq %zmm2, %zmm2 {%k1} {z}
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovd %xmm2, %r10d
-; AVX512VBMI-NEXT:    movl %r10d, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %r10d
-; AVX512VBMI-NEXT:    andl $56, %r10d
-; AVX512VBMI-NEXT:    movq -72(%rsp,%r10), %r11
-; AVX512VBMI-NEXT:    movq -80(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    movq %rax, %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq -88(%rsp,%r10), %r9
-; AVX512VBMI-NEXT:    movq %r9, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%r10), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%r10), %rbx
-; AVX512VBMI-NEXT:    movq %rbx, %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %r9
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%r10), %r14
-; AVX512VBMI-NEXT:    movq %r14, %rdi
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %rdi
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
 ; AVX512VBMI-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; AVX512VBMI-NEXT:    movq -120(%rsp,%r10), %r15
-; AVX512VBMI-NEXT:    movq %r15, %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%r10), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r15, %r10
-; AVX512VBMI-NEXT:    xorl %r14d, %r14d
+; AVX512VBMI-NEXT:    xorl %ecx, %ecx
 ; AVX512VBMI-NEXT:    kortestw %k0, %k0
-; AVX512VBMI-NEXT:    shrxq %rcx, %r11, %rcx
-; AVX512VBMI-NEXT:    cmoveq %r14, %rbx
-; AVX512VBMI-NEXT:    cmoveq %r14, %rdi
-; AVX512VBMI-NEXT:    cmoveq %r14, %r9
-; AVX512VBMI-NEXT:    cmoveq %r14, %r8
-; AVX512VBMI-NEXT:    cmoveq %r14, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r14, %rdx
-; AVX512VBMI-NEXT:    cmoveq %r14, %r10
-; AVX512VBMI-NEXT:    cmoveq %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r10, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VBMI-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512VBMI-NEXT:    vplzcntq %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    sete %cl
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rdx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    negl %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k0
+; AVX512VBMI-NEXT:    knotw %k0, %k1
+; AVX512VBMI-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..c8ac18d1d309a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -133,68 +133,42 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: shl_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
 ; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
 ; AVX512F-NEXT:    negl %eax
-; AVX512F-NEXT:    movslq %eax, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %r9
-; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r11
-; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %rbx
-; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT:    shldq %cl, %rax, %r10
+; AVX512F-NEXT:    cltq
+; AVX512F-NEXT:    vmovdqu64 -64(%rsp,%rax), %zmm2
+; AVX512F-NEXT:    vpsllq %xmm1, %zmm2, %zmm3
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[7],zmm2[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
-; AVX512F-NEXT:    movq %r10, 56(%rax)
-; AVX512F-NEXT:    movq %r14, 48(%rax)
-; AVX512F-NEXT:    movq %rbx, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %rsi, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    popq %rcx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    pushq %rax
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
@@ -210,53 +184,31 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
 ; AVX512VL-NEXT:    shrl $3, %eax
 ; AVX512VL-NEXT:    andl $56, %eax
 ; AVX512VL-NEXT:    negl %eax
-; AVX512VL-NEXT:    movslq %eax, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT:    movq %rax, %rsi
-; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT:    movq %r10, %r8
-; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq %r11, %rbx
-; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
+; AVX512VL-NEXT:    cltq
+; AVX512VL-NEXT:    vmovdqu64 -64(%rsp,%rax), %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    movq %rdi, %r10
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT:    movq %r14, %r15
-; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
-; AVX512VL-NEXT:    movq %r15, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 32(%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %rsi, 16(%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %r9, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    popq %rcx
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
@@ -265,50 +217,23 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
 ; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    negl %eax
-; AVX512VBMI-NEXT:    movslq %eax, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT:    movq %rax, %rsi
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT:    movq %r10, %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq %r11, %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    movq %rdi, %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT:    movq %r14, %r15
-; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
-; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r9, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT:    movl %edi, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %ecx
+; AVX512VBMI-NEXT:    andl $56, %ecx
+; AVX512VBMI-NEXT:    negl %ecx
+; AVX512VBMI-NEXT:    movslq %ecx, %rcx
+; AVX512VBMI-NEXT:    vmovdqu64 -64(%rsp,%rcx), %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpshldvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT:    vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT:    popq %rcx
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = shl i512 %a0, %a1
@@ -428,16 +353,14 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: lshr_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    pushq %rax
 ; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
@@ -445,44 +368,25 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r8
-; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r9, %r14
+; AVX512F-NEXT:    vmovdqu64 -128(%rsp,%rax), %zmm2
+; AVX512F-NEXT:    vpsrlq %xmm0, %zmm2, %zmm3
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    shrxq %rcx, %r15, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r8, 24(%rdi)
-; AVX512F-NEXT:    movq %rdx, 16(%rdi)
-; AVX512F-NEXT:    movq %rsi, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    popq %rcx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    pushq %rax
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
@@ -498,91 +402,52 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
 ; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
 ; AVX512VL-NEXT:    shrl $3, %eax
 ; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT:    movq %r9, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VL-NEXT:    vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512VL-NEXT:    vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r8, 24(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VL-NEXT:    movq %r15, (%rdi)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    popq %rcx
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT:    movq %r9, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r15, (%rdi)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT:    shrl $3, %edi
+; AVX512VBMI-NEXT:    andl $56, %edi
+; AVX512VBMI-NEXT:    vmovdqu64 -128(%rsp,%rdi), %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT:    vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT:    popq %rcx
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = lshr i512 %a0, %a1
@@ -711,14 +576,12 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: ashr_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
@@ -735,49 +598,32 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movl %eax, %ecx
 ; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
 ; AVX512F-NEXT:    shrl $3, %eax
 ; AVX512F-NEXT:    andl $56, %eax
-; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r8
-; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
-; AVX512F-NEXT:    shrdq %cl, %r9, %r14
+; AVX512F-NEXT:    vmovdqu64 -128(%rsp,%rax), %zmm1
+; AVX512F-NEXT:    vpsrlq %xmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vpsraq $63, -72(%rsp,%rax){1to8}, %zmm3
+; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    sarxq %rcx, %r15, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r8, 24(%rdi)
-; AVX512F-NEXT:    movq %rdx, 16(%rdi)
-; AVX512F-NEXT:    movq %rsi, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    popq %rcx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
 ; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
@@ -793,97 +639,62 @@ define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
 ; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; AVX512VL-NEXT:    movl %eax, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT:    movq %r9, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
-; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VL-NEXT:    shrl $3, %ecx
+; AVX512VL-NEXT:    andl $56, %ecx
+; AVX512VL-NEXT:    vpsraq $63, -72(%rsp,%rcx){1to8}, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 -128(%rsp,%rcx), %zmm1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [1,2,3,4,5,6,7,15]
+; AVX512VL-NEXT:    vpermi2q %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm2, %zmm0
+; AVX512VL-NEXT:    andl $63, %eax
+; AVX512VL-NEXT:    vpbroadcastq %rax, %xmm2
+; AVX512VL-NEXT:    vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm2, %xmm3
+; AVX512VL-NEXT:    vpsllq %xmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm1, %zmm1
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r8, 24(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VL-NEXT:    movq %r15, (%rdi)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    movq %rdi, %rax
 ; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
 ; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    sarq $63, %r10
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %eax, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %eax
-; AVX512VBMI-NEXT:    andl $56, %eax
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
-; AVX512VBMI-NEXT:    movq %r9, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
-; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r15, (%rdi)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    sarq $63, %rdi
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vpbroadcastq %r10, %zmm0
+; AVX512VBMI-NEXT:    # kill: def $r10d killed $r10d killed $r10 def $r10
+; AVX512VBMI-NEXT:    shrl $3, %r10d
+; AVX512VBMI-NEXT:    andl $56, %r10d
+; AVX512VBMI-NEXT:    vpsraq $63, -72(%rsp,%r10){1to8}, %zmm1
+; AVX512VBMI-NEXT:    vmovdqu64 -128(%rsp,%r10), %zmm2
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [1,2,3,4,5,6,7,15]
+; AVX512VBMI-NEXT:    vpermi2q %zmm1, %zmm2, %zmm3
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm3, %zmm2
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm2, 32(%rax)
+; AVX512VBMI-NEXT:    vmovdqu %ymm2, (%rax)
+; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = ashr i512 %a0, %a1
   ret i512 %r
@@ -1008,162 +819,64 @@ define i512 @shl_i512_load(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: shl_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rsi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %edx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    negl %edx
-; AVX512F-NEXT:    movslq %edx, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %r9
-; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r11
-; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %rbx
-; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT:    shldq %cl, %rax, %r10
+; AVX512F-NEXT:    movl %edx, %eax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    shlxl %edx, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpexpandq (%rsi), %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
-; AVX512F-NEXT:    movq %r10, 56(%rax)
-; AVX512F-NEXT:    movq %r14, 48(%rax)
-; AVX512F-NEXT:    movq %rbx, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %rsi, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    negl %edx
-; AVX512VL-NEXT:    movslq %edx, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT:    movq %rax, %rsi
-; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT:    movq %r10, %r8
-; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq %r11, %rbx
-; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    movq %rdi, %r10
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT:    movq %r14, %r15
-; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
-; AVX512VL-NEXT:    movq %r15, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 32(%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %rsi, 16(%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %r9, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpexpandq (%rsi), %zmm2 {%k1} {z}
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpsllq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vporq %zmm0, %zmm3, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %edx, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    negl %edx
-; AVX512VBMI-NEXT:    movslq %edx, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT:    movq %rax, %rsi
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT:    movq %r10, %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq %r11, %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    movq %rdi, %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT:    movq %r14, %r15
-; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
-; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r9, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edi
+; AVX512VBMI-NEXT:    shlxl %ecx, %edi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpexpandq (%rsi), %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1276,141 +989,67 @@ define i512 @lshr_i512_load(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: lshr_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rsi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %edx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    movl %edx, %eax
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    shlxl %edx, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    shrxq %rcx, %rdx, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r9, 24(%rdi)
-; AVX512F-NEXT:    movq %rsi, 16(%rdi)
-; AVX512F-NEXT:    movq %r8, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r9, 24(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 16(%rdi)
-; AVX512VL-NEXT:    movq %r8, 8(%rdi)
-; AVX512VL-NEXT:    movq %rdx, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %edx, %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpand %xmm1, %xmm2, %xmm3
+; AVX512VL-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %edx, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rdx
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -1539,174 +1178,76 @@ define i512 @ashr_i512_load(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: ashr_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rsi), %ymm0
-; AVX512F-NEXT:    vmovaps 32(%rsi), %xmm1
-; AVX512F-NEXT:    movq 48(%rsi), %rax
-; AVX512F-NEXT:    movq 56(%rsi), %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %edx, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512F-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512F-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rdx), %rdx
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rbx
-; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    movl %edx, %eax
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm0, %xmm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    sarxq %rcx, %rdx, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r9, 24(%rdi)
-; AVX512F-NEXT:    movq %rsi, 16(%rdi)
-; AVX512F-NEXT:    movq %r8, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VL-NEXT:    vmovaps 32(%rsi), %xmm1
-; AVX512VL-NEXT:    movq 48(%rsi), %rax
-; AVX512VL-NEXT:    movq 56(%rsi), %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    sarq $63, %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %edx, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512VL-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VL-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rdx), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rdx
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r9, 24(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 16(%rdi)
-; AVX512VL-NEXT:    movq %r8, 8(%rdi)
-; AVX512VL-NEXT:    movq %rdx, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    movl $-1, %esi
+; AVX512VL-NEXT:    shlxl %edx, %esi, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm1, %zmm2
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm0
-; AVX512VBMI-NEXT:    vmovaps 32(%rsi), %xmm1
-; AVX512VBMI-NEXT:    movq 48(%rsi), %rax
-; AVX512VBMI-NEXT:    movq 56(%rsi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    sarq $63, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %edx, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rdx), %rsi
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rdx), %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rsi
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rdx), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rdx), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rdx), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rdx), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rdx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT:    vpsraq $63, %zmm0, %zmm2
+; AVX512VBMI-NEXT:    vpermq %zmm2, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm2
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm1, %zmm3
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm3 {%k1}
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm0 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm3
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm3, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -2223,160 +1764,39 @@ define i512 @shl_1_i512(i512 %a0) nounwind {
 ;
 ; AVX512F-LABEL: shl_1_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %esi, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    negl %esi
-; AVX512F-NEXT:    movslq %esi, %r8
-; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %r9
-; AVX512F-NEXT:    shldq %cl, %rax, %r9
-; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r11
-; AVX512F-NEXT:    shldq %cl, %r10, %r11
-; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT:    movq %r10, %rbx
-; AVX512F-NEXT:    shldq %cl, %rax, %rbx
-; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT:    movq %rax, %r14
-; AVX512F-NEXT:    shldq %cl, %r10, %r14
-; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT:    shldq %cl, %rax, %r10
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
-; AVX512F-NEXT:    movq %r10, 56(%rax)
-; AVX512F-NEXT:    movq %r14, 48(%rax)
-; AVX512F-NEXT:    movq %rbx, 40(%rax)
-; AVX512F-NEXT:    movq %r11, 32(%rax)
-; AVX512F-NEXT:    movq %r9, 24(%rax)
-; AVX512F-NEXT:    movq %rsi, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %r8, (%rax)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    movl $1, %ecx
+; AVX512F-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512F-NEXT:    shrl $6, %esi
+; AVX512F-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shl_1_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %esi
-; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    negl %esi
-; AVX512VL-NEXT:    movslq %esi, %r9
-; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT:    movq %rax, %rsi
-; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT:    movq %r10, %r8
-; AVX512VL-NEXT:    shldq %cl, %rax, %r8
-; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq %r11, %rbx
-; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    movq %rdi, %r10
-; AVX512VL-NEXT:    shldq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT:    movq %r14, %r15
-; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
-; AVX512VL-NEXT:    movq %r15, 48(%rax)
-; AVX512VL-NEXT:    movq %r10, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 32(%rax)
-; AVX512VL-NEXT:    movq %r8, 24(%rax)
-; AVX512VL-NEXT:    movq %rsi, 16(%rax)
-; AVX512VL-NEXT:    movq %rdx, 8(%rax)
-; AVX512VL-NEXT:    movq %r9, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    movl $1, %ecx
+; AVX512VL-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: shl_1_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %esi, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    negl %esi
-; AVX512VBMI-NEXT:    movslq %esi, %r9
-; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT:    movq %rax, %rsi
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT:    movq %r10, %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq %r11, %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    movq %rdi, %r10
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT:    movq %r14, %r15
-; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
-; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r8, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r9, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    movl $1, %ecx
+; AVX512VBMI-NEXT:    shlxq %rsi, %rcx, %rdx
+; AVX512VBMI-NEXT:    shrl $6, %esi
+; AVX512VBMI-NEXT:    shlxq %rsi, %rcx, %rcx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %r = shl i512 1, %a0
@@ -2485,139 +1905,67 @@ define i512 @lshr_signbit_i512(i512 %a0) nounwind {
 ;
 ; AVX512F-LABEL: lshr_signbit_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %esi, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    movl %esi, %eax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %esi
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    shlxl %esi, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    shrxq %rcx, %rsi, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r9, 24(%rdi)
-; AVX512F-NEXT:    movq %rdx, 16(%rdi)
-; AVX512F-NEXT:    movq %r8, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_signbit_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %esi
-; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r9, 24(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VL-NEXT:    movq %r8, 8(%rdi)
-; AVX512VL-NEXT:    movq %rsi, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %esi, %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm2 {%k1} {z}
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm3
+; AVX512VL-NEXT:    vpsrlq %xmm3, %zmm2, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_signbit_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %esi, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpbroadcastq %rsi, %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %s = shl i512 1, 511
@@ -2732,141 +2080,70 @@ define i512 @ashr_signbit_i512(i512 %a0) nounwind {
 ;
 ; AVX512F-LABEL: ashr_signbit_i512:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    movl %esi, %eax
+; AVX512F-NEXT:    shrl $6, %esi
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %esi, %ecx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
-; AVX512F-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %esi, %ecx
-; AVX512F-NEXT:    andl $63, %ecx
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512F-NEXT:    movq %rax, %r8
-; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512F-NEXT:    shrdq %cl, %r10, %r9
-; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512F-NEXT:    shrdq %cl, %r11, %r10
-; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
-; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
-; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
-; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovq %rax, %xmm1
+; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; AVX512F-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512F-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; AVX512F-NEXT:    valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; AVX512F-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; AVX512F-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    sarxq %rcx, %rsi, %rcx
-; AVX512F-NEXT:    movq %rcx, 56(%rdi)
-; AVX512F-NEXT:    movq %rbx, 48(%rdi)
-; AVX512F-NEXT:    movq %r11, 40(%rdi)
-; AVX512F-NEXT:    movq %r10, 32(%rdi)
-; AVX512F-NEXT:    movq %r9, 24(%rdi)
-; AVX512F-NEXT:    movq %rdx, 16(%rdi)
-; AVX512F-NEXT:    movq %r8, 8(%rdi)
-; AVX512F-NEXT:    movq %r14, (%rdi)
-; AVX512F-NEXT:    addq $8, %rsp
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_signbit_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %esi, %ecx
-; AVX512VL-NEXT:    andl $63, %ecx
-; AVX512VL-NEXT:    shrl $3, %esi
-; AVX512VL-NEXT:    andl $56, %esi
-; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
-; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
-; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VL-NEXT:    movq %r11, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %r9, 24(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VL-NEXT:    movq %r8, 8(%rdi)
-; AVX512VL-NEXT:    movq %rsi, (%rdi)
-; AVX512VL-NEXT:    addq $8, %rsp
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %esi, %edx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    vpand %xmm0, %xmm3, %xmm4
+; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; AVX512VL-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_signbit_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %esi, %ecx
-; AVX512VBMI-NEXT:    andl $63, %ecx
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
-; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
-; AVX512VBMI-NEXT:    addq $8, %rsp
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpbroadcastq %rsi, %zmm0
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm1, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %s = shl i512 1, 511
@@ -3099,63 +2376,51 @@ define i64 @lshr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: lshr_extract_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
 ; AVX512F-NEXT:    movq %rsi, %rcx
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %ecx, %edx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    movl %ecx, %eax
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %eax, %edx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rax
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512F-NEXT:    popq %rcx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_extract_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX512VL-NEXT:    movq %rsi, %rcx
-; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %ecx, %edx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    movl %ecx, %eax
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT:    popq %rcx
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_extract_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rdi), %ymm1
 ; AVX512VBMI-NEXT:    movq %rsi, %rcx
-; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %ecx, %edx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT:    movl %ecx, %eax
+; AVX512VBMI-NEXT:    shrl $6, %eax
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT:    kmovd %eax, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rax
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT:    popq %rcx
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3232,96 +2497,60 @@ define i64 @ashr_extract_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: ashr_extract_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
 ; AVX512F-NEXT:    movq %rsi, %rcx
-; AVX512F-NEXT:    vmovups (%rdi), %ymm0
-; AVX512F-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT:    movq 48(%rdi), %rax
-; AVX512F-NEXT:    movq 56(%rdi), %rdx
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    sarq $63, %rdx
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %ecx, %edx
-; AVX512F-NEXT:    shrl $3, %edx
-; AVX512F-NEXT:    andl $56, %edx
-; AVX512F-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512F-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    movl %ecx, %eax
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %eax, %edx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512F-NEXT:    vmovq %xmm1, %rax
 ; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512F-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512F-NEXT:    popq %rcx
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_extract_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
 ; AVX512VL-NEXT:    movq %rsi, %rcx
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT:    movq 48(%rdi), %rax
-; AVX512VL-NEXT:    movq 56(%rdi), %rdx
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    sarq $63, %rdx
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %ecx, %edx
-; AVX512VL-NEXT:    shrl $3, %edx
-; AVX512VL-NEXT:    andl $56, %edx
-; AVX512VL-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VL-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VL-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    movl %ecx, %eax
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
 ; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VL-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VL-NEXT:    popq %rcx
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_extract_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm0
 ; AVX512VBMI-NEXT:    movq %rsi, %rcx
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT:    movq 48(%rdi), %rax
-; AVX512VBMI-NEXT:    movq 56(%rdi), %rdx
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    sarq $63, %rdx
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %ecx, %edx
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rdx), %rax
-; AVX512VBMI-NEXT:    movq -120(%rsp,%rdx), %rdx
+; AVX512VBMI-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    movl %ecx, %eax
+; AVX512VBMI-NEXT:    shrl $6, %eax
+; AVX512VBMI-NEXT:    movl $-1, %edx
+; AVX512VBMI-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VBMI-NEXT:    kmovd %eax, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rax
 ; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rax
-; AVX512VBMI-NEXT:    popq %rcx
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3370,45 +2599,33 @@ define i64 @lshr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rdi), %zmm0
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    andl $7, %esi
-; AVX512F-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    shlxl %esi, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vmovq %xmm0, %rax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    andl $7, %esi
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    movl $-1, %eax
+; AVX512VL-NEXT:    shlxl %esi, %eax, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT:    vmovq %xmm0, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: lshr_extract_idx_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovups 32(%rdi), %ymm1
-; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    andl $7, %esi
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT:    movl $-1, %eax
+; AVX512VBMI-NEXT:    shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT:    kmovd %eax, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rax
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -3474,78 +2691,42 @@ define i64 @ashr_extract_idx_load_i512_i64(ptr %p0, i512 %a1) nounwind {
 ;
 ; AVX512F-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rax
-; AVX512F-NEXT:    vmovups (%rdi), %ymm0
-; AVX512F-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512F-NEXT:    movq 48(%rdi), %rax
-; AVX512F-NEXT:    movq 56(%rdi), %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    sarq $63, %rcx
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    andl $7, %esi
-; AVX512F-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512F-NEXT:    popq %rcx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512F-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512F-NEXT:    movl $-1, %eax
+; AVX512F-NEXT:    shlxl %esi, %eax, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovq %xmm1, %rax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rax
-; AVX512VL-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VL-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512VL-NEXT:    movq 48(%rdi), %rax
-; AVX512VL-NEXT:    movq 56(%rdi), %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    sarq $63, %rcx
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    andl $7, %esi
-; AVX512VL-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VL-NEXT:    popq %rcx
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VL-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    movl $-1, %eax
+; AVX512VL-NEXT:    shlxl %esi, %eax, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovq %xmm1, %rax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: ashr_extract_idx_load_i512_i64:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rax
-; AVX512VBMI-NEXT:    vmovups (%rdi), %ymm0
-; AVX512VBMI-NEXT:    vmovaps 32(%rdi), %xmm1
-; AVX512VBMI-NEXT:    movq 48(%rdi), %rax
-; AVX512VBMI-NEXT:    movq 56(%rdi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    sarq $63, %rcx
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    andl $7, %esi
-; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi,8), %rax
-; AVX512VBMI-NEXT:    popq %rcx
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VBMI-NEXT:    vpsraq $63, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; AVX512VBMI-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; AVX512VBMI-NEXT:    movl $-1, %eax
+; AVX512VBMI-NEXT:    shlxl %esi, %eax, %eax
+; AVX512VBMI-NEXT:    kmovd %eax, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rax
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 65b602801b365..e9ddc576c6cd8 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -11,10 +11,10 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX1,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-NO-SHLD-NO-BMI2,X64-NO-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-NO-BMI2,X64-HAVE-SHLD-NO-BMI2,X64-HAVE-SHLD-NO-BMI2-AVX,X64-NO-BMI2-AVX512,X64-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2,X64-NO-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X64,X64-AVX,X64-AVX512,X64-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2,X64-HAVE-SHLD-HAVE-BMI2-AVX,X64-HAVE-BMI2-AVX512,X64-HAVE-SHLD-HAVE-BMI2-AVX512
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-SSE2
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+sse2,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-SSE2,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-SSE2
@@ -27,10 +27,10 @@
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX1
 ; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX1,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX1
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-HAVE-SHLD-NO-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-NO-SHLD-HAVE-BMI2-AVX512
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-NO-SHLD-NO-BMI2,X86-NO-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-NO-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,-bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-NO-BMI2,X86-HAVE-SHLD-NO-BMI2,X86-HAVE-SHLD-NO-BMI2-AVX,X86-NO-BMI2-AVX512,X86-HAVE-SHLD-NO-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,+slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2,X86-NO-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-NO-SHLD-HAVE-BMI2-AVX512
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mattr=+avx512vl,+bmi2,-slow-shld | FileCheck %s --check-prefixes=ALL,X86,X86-AVX,X86-AVX512,X86-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2,X86-HAVE-SHLD-HAVE-BMI2-AVX,X86-HAVE-BMI2-AVX512,X86-HAVE-SHLD-HAVE-BMI2-AVX512
 
 define void @lshr_4bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-LABEL: lshr_4bytes:
@@ -10617,262 +10617,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%r9,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %r9d
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -128(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -120(%rsp,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r8,%r8), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r10, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -104(%rsp,%r9), %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -96(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -112(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -88(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -80(%rsp,%r9), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -72(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leaq (%r9,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r8, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rdi,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %edi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -96(%rsp,%rdi), %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -104(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %rsi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -112(%rsp,%rdi), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -80(%rsp,%rdi), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -88(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -72(%rsp,%rdi), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -128(%rsp,%rdi), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -120(%rsp,%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdq %cl, %rdi, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -120(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -104(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -96(%rsp,%rsi), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -88(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -80(%rsp,%rsi), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -72(%rsp,%rsi), %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leaq (%rsi,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rax, %r9, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r10, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %rsi, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rcx, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT:    leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq %rax, %xmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; X64-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; X64-NO-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-NO-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq %rcx, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; X64-HAVE-BMI2-AVX512-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; X64-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: lshr_64bytes:
 ; X86-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
@@ -12810,563 +12602,60 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retl
 ;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll $3, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $24, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 72(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%eax,%eax), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %cl, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%ecx,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 72(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %ebp, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 120(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 116(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 124(%esp,%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, 60(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, 56(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 48(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, 52(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, 40(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 44(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 32(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 36(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 24(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, (%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 4(%ecx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%ecx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-NO-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm3, %zmm0, %zmm3
+; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm3, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -13474,26 +12763,29 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
-; X64-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    pushq %rax
-; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT:    movl (%rsi), %eax
-; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    andl $7, %eax
-; X64-AVX512-NEXT:    vmovups -128(%rsp,%rax,8), %xmm0
-; X64-AVX512-NEXT:    vmovups -112(%rsp,%rax,8), %xmm1
-; X64-AVX512-NEXT:    vmovups -96(%rsp,%rax,8), %xmm2
-; X64-AVX512-NEXT:    vmovups -80(%rsp,%rax,8), %xmm3
-; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    popq %rax
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
+; X64-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: lshr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
@@ -13682,29 +12974,35 @@ define void @lshr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: lshr_64bytes_qwordOff:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $140, %esp
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
-; X86-AVX512-NEXT:    movl (%ecx), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT:    vmovups %zmm0, (%esp)
-; X86-AVX512-NEXT:    andl $7, %ecx
-; X86-AVX512-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX512-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX512-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX512-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $140, %esp
-; X86-AVX512-NEXT:    vzeroupper
-; X86-AVX512-NEXT:    retl
+; X86-NO-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: lshr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %qwordOff = load i512, ptr %qwordOff.ptr, align 1
   %bitOff = shl i512 %qwordOff, 6
@@ -14606,267 +13904,52 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
-; X64-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rcx,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    negl %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movslq %ecx, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -24(%rsp,%r9), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -32(%rsp,%r9), %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -40(%rsp,%r9), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r11, %r10
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -48(%rsp,%r9), %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -64(%rsp,%r9), %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -56(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r15, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r12, %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -16(%rsp,%r9), %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r13, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq -8(%rsp,%r9), %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    orq %r9, %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r12, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    movq %r8, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX512-NEXT:    retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    negl %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r14, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shlq %cl, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movq %r9, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retq
-;
-; X64-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movslq %esi, %rsi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -24(%rsp,%rsi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %rdi, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %al
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -32(%rsp,%rsi), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r8, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r9, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -40(%rsp,%rsi), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r9, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r10, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -48(%rsp,%rsi), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r10, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r11, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -64(%rsp,%rsi), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -56(%rsp,%rsi), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r11, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r14, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %rbx, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r15, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -16(%rsp,%rsi), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r15, %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %r12, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, -8(%rsp,%rsi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxq %rax, %r15, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orq %rcx, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r14, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rdi, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rbx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addq $8, %rsp
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
-;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movslq %eax, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -32(%rsp,%r8), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -24(%rsp,%r8), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rax, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -40(%rsp,%r8), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rdi, %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -48(%rsp,%r8), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r10, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -64(%rsp,%r8), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -56(%rsp,%r8), %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %rbx, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -16(%rsp,%r8), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r14, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r9, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq -8(%rsp,%r8), %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r14, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxq %rcx, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldq %cl, %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r8, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r15, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rbx, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r10, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rdi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rax, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %rsi, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movq %r9, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    movl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT:    leal (,%rcx,8), %eax
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X64-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; X64-NO-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-NO-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    leal (,%rax,8), %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq %rcx, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X64-HAVE-BMI2-AVX512-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%rdi), %zmm3 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: shl_64bytes:
 ; X86-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
@@ -16892,596 +15975,62 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
 ; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retl
 ;
-; X86-NO-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $60, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    subl %edx, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl (%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 4(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 12(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 8(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 20(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 16(%edi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 28(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 24(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 36(%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 32(%edx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 44(%ebp), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 40(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 52(%ebp), %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    negl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 176(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 60(%edi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl 56(%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shrl %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX512-NEXT:    retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%eax), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    subl %ebp, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 8(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 4(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 40(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 44(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    negl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl 160(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edi, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %ebx, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shll %cl, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    shldl %cl, %edi, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %esi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    movl %edx, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX512-NEXT:    retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%eax,8), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $24, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $60, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    subl %eax, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 4(%edx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    notb %bl
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 8(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 12(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 16(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 20(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 24(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 28(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 32(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 36(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 40(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %esi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 44(%edx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 48(%edx), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 52(%edx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %esi, %ecx, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %esi, %ebp, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %eax, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %edi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 56(%edx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ebp, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shrxl %ebx, %edi, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ebp, 188(%esp,%ebx), %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    orl %ecx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, (%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 56(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebx, 60(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, 48(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 52(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 40(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 44(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 32(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 36(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 24(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 28(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 16(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 20(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 8(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 12(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %eax, 4(%edx)
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX512-NEXT:    retl
-;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    subl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups (%ecx), %zmm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal (,%ebx,8), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    andl $60, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    subl %ebx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 4(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 8(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 16(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 24(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 32(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 44(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %ebp, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %esi, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 60(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl (%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    negl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl 176(%esp,%ebx), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, 60(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %ebx, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebx, 48(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 44(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 32(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 24(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 28(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 16(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 20(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 8(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, 12(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %edi, 4(%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    movl %ecx, (%eax)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    addl $204, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX512-NEXT:    retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    movl (%esi), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm3 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-NO-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    popl %esi
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%edx), %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %esi, %edx, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%ecx), %zmm3 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm1[7],zmm3[0,1,2,3,4,5,6]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq $1, %zmm1, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm0, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -17598,29 +16147,27 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq
 ;
-; X64-AVX512-LABEL: shl_64bytes_qwordOff:
-; X64-AVX512:       # %bb.0:
-; X64-AVX512-NEXT:    pushq %rax
-; X64-AVX512-NEXT:    vmovups (%rdi), %zmm0
-; X64-AVX512-NEXT:    movl (%rsi), %eax
-; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X64-AVX512-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512-NEXT:    shll $3, %eax
-; X64-AVX512-NEXT:    andl $56, %eax
-; X64-AVX512-NEXT:    negl %eax
-; X64-AVX512-NEXT:    cltq
-; X64-AVX512-NEXT:    vmovups -64(%rsp,%rax), %xmm0
-; X64-AVX512-NEXT:    vmovups -48(%rsp,%rax), %xmm1
-; X64-AVX512-NEXT:    vmovups -32(%rsp,%rax), %xmm2
-; X64-AVX512-NEXT:    vmovups -16(%rsp,%rax), %xmm3
-; X64-AVX512-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX512-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX512-NEXT:    popq %rax
-; X64-AVX512-NEXT:    vzeroupper
-; X64-AVX512-NEXT:    retq
+; X64-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%rdi), %zmm0 {%k1} {z}
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: shl_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
@@ -17821,33 +16368,37 @@ define void @shl_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) nou
 ; X86-AVX1-NEXT:    vzeroupper
 ; X86-AVX1-NEXT:    retl
 ;
-; X86-AVX512-LABEL: shl_64bytes_qwordOff:
-; X86-AVX512:       # %bb.0:
-; X86-AVX512-NEXT:    subl $140, %esp
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %zmm0
-; X86-AVX512-NEXT:    movl (%ecx), %ecx
-; X86-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; X86-AVX512-NEXT:    vmovups %zmm1, (%esp)
-; X86-AVX512-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; X86-AVX512-NEXT:    shll $3, %ecx
-; X86-AVX512-NEXT:    andl $56, %ecx
-; X86-AVX512-NEXT:    leal {{[0-9]+}}(%esp), %edx
-; X86-AVX512-NEXT:    subl %ecx, %edx
-; X86-AVX512-NEXT:    vmovups (%edx), %xmm0
-; X86-AVX512-NEXT:    vmovups 16(%edx), %xmm1
-; X86-AVX512-NEXT:    vmovups 32(%edx), %xmm2
-; X86-AVX512-NEXT:    negl %ecx
-; X86-AVX512-NEXT:    vmovups 112(%esp,%ecx), %xmm3
-; X86-AVX512-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX512-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX512-NEXT:    addl $140, %esp
-; X86-AVX512-NEXT:    vzeroupper
-; X86-AVX512-NEXT:    retl
+; X86-NO-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    popl %esi
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: shl_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpexpandq (%edx), %zmm0 {%k1} {z}
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %qwordOff = load i512, ptr %qwordOff.ptr, align 1
   %bitOff = shl i512 %qwordOff, 6
@@ -18501,321 +17052,378 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popq %r15
 ; X64-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retq
 ;
-; X64-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    pushq %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rdi), %rax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rdi), %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl (%rsi), %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rcx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rdi,8), %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %edi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -128(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -120(%rsp,%rdi), %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %sil
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r9,%r9), %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r10, %r8
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -104(%rsp,%rdi), %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -96(%rsp,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%r12,%r12), %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -112(%rsp,%rdi), %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq %r10, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r14, %r10
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -88(%rsp,%rdi), %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r14, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -80(%rsp,%rdi), %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%rbp,%rbp), %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r13, %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq %r14, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r12, %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rdi), %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    leaq (%rdi,%rdi), %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %rbp, %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shrq %cl, %r9
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    shlq %cl, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    orq %r9, %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    sarq %cl, %rdi
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 8(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r12, 48(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r15, 40(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, 16(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, 24(%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, (%rdx)
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    addq $8, %rsp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r12
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r13
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r14
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %r15
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    popq %rbp
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
-; X64-NO-SHLD-NO-BMI2-AVX-NEXT:    retq
-;
-; X64-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r10, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rax, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarq %cl, %r11
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r11, 56(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %rbx
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %r14
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popq %r15
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
-; X64-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retq
+; X64-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 48(%rdi), %rax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq 56(%rdi), %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarq $63, %rcx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%rdi,8), %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %edi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -128(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -120(%rsp,%rdi), %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %sil
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r9,%r9), %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r10, %r8
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -96(%rsp,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%r12,%r12), %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %rbx, %r11
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rbx, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq %r10, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r14, %r10
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -88(%rsp,%rdi), %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -80(%rsp,%rdi), %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%rbp,%rbp), %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r13, %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq %r14, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r12, %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    leaq (%rdi,%rdi), %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %rbp, %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrq %cl, %r9
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq %rbx, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    shlq %cl, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    orq %r9, %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarq %cl, %rdi
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, 56(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %rbx, 8(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r12, 48(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, 32(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    movq %r8, (%rdx)
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    addq $8, %rsp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r12
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r13
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r14
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %r15
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbp
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
+; X64-NO-SHLD-NO-BMI2-AVX1-NEXT:    retq
 ;
-; X64-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %ecx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %esi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %cl
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r10,%r10), %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rdi, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r8, %rdi
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r11, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r14,%r14), %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r8, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rbx, %r8
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r9, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r11, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %rbx, %r11
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %rbx, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%r12,%r12), %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r13, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r15, %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r14, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %rbx, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r14, %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r12, %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leaq (%rax,%rax), %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r15, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r14, %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxq %rsi, %r10, %r10
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addq %r9, %r9
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxq %rcx, %r9, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orq %r10, %rcx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxq %rsi, %rax, %rax
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, 56(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, 8(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r15, 48(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbx, 32(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r13, 40(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 24(%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, (%rdx)
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbx
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r12
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r13
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r14
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r15
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
-; X64-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retq
+; X64-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%rsi), %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $56, %eax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r9, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r10, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rax, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarq %cl, %r11
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r11, 56(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %rbx
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r14
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
+; X64-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    retq
 ;
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 48(%rdi), %rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%rsi), %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarq $63, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%rax,8), %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %ecx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $56, %eax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -96(%rsp,%rax), %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rdi, %rsi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -112(%rsp,%rax), %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r9, %r8
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -80(%rsp,%rax), %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r11, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r9, %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -72(%rsp,%rax), %r11
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r11, %r9
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -128(%rsp,%rax), %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq -120(%rsp,%rax), %rax
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rax, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %r10, %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarxq %rcx, %r11, %r10
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdq %cl, %rax, %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r15, 8(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r9, 48(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rbx, 40(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r8, 16(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %rsi, 24(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r14, (%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movq %r10, 56(%rdx)
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %rbx
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r14
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popq %r15
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
-; X64-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retq
+; X64-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-NO-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 48(%rdi), %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 56(%rdi), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarq $63, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %ecx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %esi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %cl
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -120(%rsp,%rax), %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -112(%rsp,%rax), %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r10,%r10), %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %rdi, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r8, %rdi
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %r11, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -96(%rsp,%rax), %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r14,%r14), %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r8, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rbx, %r8
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %r9, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r11, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %rbx, %r11
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -88(%rsp,%rax), %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %rbx, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -80(%rsp,%rax), %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%r12,%r12), %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r13, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r15, %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %r14, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %rbx, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r14, %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %r12, %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -72(%rsp,%rax), %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leaq (%rax,%rax), %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r15, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r14, %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxq %rsi, %r10, %r10
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addq %r9, %r9
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxq %rcx, %r9, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orq %r10, %rcx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxq %rsi, %rax, %rax
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rcx, 8(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r15, 48(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r13, 40(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r11, 16(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r8, 24(%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, (%rdx)
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r12
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r13
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
+; X64-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
 ;
-; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebp
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
-; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 48(%rdi), %rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq 56(%rdi), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%rsi), %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarq $63, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%rax,8), %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %ecx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $56, %eax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -96(%rsp,%rax), %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -104(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %rdi, %rsi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -112(%rsp,%rax), %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r10, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %r8
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -80(%rsp,%rax), %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -88(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r11, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r9, %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %rdi
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -72(%rsp,%rax), %r11
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r11, %r9
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -128(%rsp,%rax), %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq -120(%rsp,%rax), %rax
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rax, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %r10, %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxq %rcx, %r11, %r10
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $rcx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdq %cl, %rax, %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r15, 8(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r9, 48(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rdi, 32(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rbx, 40(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r8, 16(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r14, (%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movq %r10, 56(%rdx)
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %rbx
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r14
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popq %r15
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
+; X64-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X64-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
+; X64-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
+; X64-NO-BMI2-AVX512-NEXT:    shll $3, %eax
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq %rax, %xmm0
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-NO-BMI2-AVX512-NEXT:    vpand %xmm3, %xmm0, %xmm4
+; X64-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; X64-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-NO-BMI2-AVX512-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; X64-NO-BMI2-AVX512-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %esi, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqa64 %zmm1, %zmm2
+; X64-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm2 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT:    shll $3, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq %rax, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpand %xmm3, %xmm0, %xmm4
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; X64-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm0, %xmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm0, %zmm1, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
+;
+; X86-NO-SHLD-NO-BMI2-SSE2-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-SSE2:       # %bb.0:
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebp
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %ebx
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %edi
+; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    pushl %esi
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    subl $204, %esp
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-SHLD-NO-BMI2-SSE2-NEXT:    movl (%ecx), %eax
@@ -20304,668 +18912,733 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    popl %ebp
 ; X86-HAVE-SHLD-HAVE-BMI2-SSE4-NEXT:    retl
 ;
-; X86-NO-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-NO-BMI2-AVX:       # %bb.0:
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    subl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $60, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 68(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll $3, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    andl $24, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 72(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    notb %ch
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 64(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 76(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 80(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edi,%edi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 84(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 88(%esp,%esi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 92(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 96(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edi,%edi), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 100(%esp,%esi), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 104(%esp,%esi), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %ebx, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 108(%esp,%esi), %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 112(%esp,%esi), %ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ecx,%ecx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebp, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %edx, %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 116(%esp,%esi), %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %al, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 120(%esp,%edx), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %ebx, %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %dl, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shrl %cl, %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl 124(%esp,%edx), %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    leal (%ebx,%ebx), %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movb %ch, %cl
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    shll %cl, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    orl %eax, %edx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    sarl %cl, %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, 60(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 48(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ebp, 52(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 40(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    addl $204, %esp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    popl %ebp
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
-; X86-NO-SHLD-NO-BMI2-AVX-NEXT:    retl
-;
-; X86-HAVE-SHLD-NO-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-NO-BMI2-AVX:       # %bb.0:
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%ecx), %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarl $31, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shll $3, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 88(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 100(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 48(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl 108(%esp,%ebp), %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edx, 56(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    sarl %cl, %eax
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %edi, 52(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 40(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %ebx, (%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    movl %eax, 4(%ebp)
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    addl $188, %esp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %esi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %edi
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %ebx
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    popl %ebp
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    vzeroupper
-; X86-HAVE-SHLD-NO-BMI2-AVX-NEXT:    retl
-;
-; X86-NO-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-NO-SHLD-HAVE-BMI2-AVX:       # %bb.0:
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    subl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%ecx), %ymm0
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%ecx), %xmm1
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%ecx), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%ecx), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (,%eax,8), %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $24, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    andl $60, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    notb %dl
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%edi,%edi), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 76(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 88(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 84(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 96(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%esi,%esi), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 92(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 104(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 100(%esp,%ecx), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edi, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 112(%esp,%ecx), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %esi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %esi, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %esi, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ecx, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 120(%esp,%ebp), %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%edi,%edi), %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ecx, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 116(%esp,%ebp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %eax, %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl %eax, %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %eax, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %ebp, %ecx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl 124(%esp,%eax), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shlxl %edx, %ebp, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    shrxl %ebx, %edi, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    orl %edi, %edx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %ebx, %eax, %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, 60(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 56(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 48(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 52(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 40(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 44(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 32(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 36(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 24(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 28(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 16(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 20(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 8(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 12(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, (%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%eax)
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    addl $204, %esp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebp
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
-; X86-NO-SHLD-HAVE-BMI2-AVX-NEXT:    retl
+; X86-NO-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%ecx), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%ecx), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%ecx), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl $31, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    notb %ch
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%esi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%esi), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%esi), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%edx,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %ebx, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%esi), %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 112(%esp,%esi), %ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ecx,%ecx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebp, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %edx, %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 116(%esp,%esi), %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %al, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 120(%esp,%edx), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %ebx, %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %dl, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shrl %cl, %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl 124(%esp,%edx), %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    leal (%ebx,%ebx), %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movb %ch, %cl
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    shll %cl, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    orl %eax, %edx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, 60(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebp, 52(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 40(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebp
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
+; X86-NO-SHLD-NO-BMI2-AVX1-NEXT:    retl
+;
+; X86-HAVE-SHLD-NO-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-NO-BMI2-AVX1:       # %bb.0:
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    pushl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%eax), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%eax), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ecx, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $60, %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shll $3, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edx, 56(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    sarl %cl, %eax
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %edi, 52(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 40(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %ebx, (%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    movl %eax, 4(%ebp)
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %esi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %edi
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebx
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    popl %ebp
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    vzeroupper
+; X86-HAVE-SHLD-NO-BMI2-AVX1-NEXT:    retl
+;
+; X86-NO-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-NO-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%ecx), %ymm0
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%ecx), %xmm1
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%ecx), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%ecx), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarl $31, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (,%eax,8), %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    notb %dl
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%esi,%esi), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ecx), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 112(%esp,%ecx), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %esi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %esi, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %esi, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ecx, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 120(%esp,%ebp), %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%edi,%edi), %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ecx, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 116(%esp,%ebp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %eax, %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl %eax, %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %eax, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %ebp, %ecx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 124(%esp,%eax), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shlxl %edx, %ebp, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    shrxl %ebx, %edi, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    orl %edi, %edx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ebx, %eax, %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, 60(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 56(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 48(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 52(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, (%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%eax)
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $204, %esp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebp
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
+; X86-NO-SHLD-HAVE-BMI2-AVX1-NEXT:    retl
 ;
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-LABEL: ashr_64bytes:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX:       # %bb.0:
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    pushl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    subl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups (%eax), %ymm0
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups 32(%eax), %xmm1
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%eax), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%eax), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%eax), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl (%ecx), %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarl $31, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $60, %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 56(%esp,%ebp), %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 52(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shll $3, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    andl $24, %ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 64(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 60(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 72(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 80(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 76(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 88(%esp,%ebp), %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 84(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebx, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 96(%esp,%ebp), %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 92(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %esi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 104(%esp,%ebp), %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 48(%esp,%ebp), %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl 108(%esp,%ebp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebp, (%esp) # 4-byte Spill
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %ebp, %eax
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 56(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %esi, 48(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edx, 52(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ebx, 40(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 44(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 32(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 36(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 24(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 28(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 16(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 20(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 8(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 12(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %edi, (%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %ecx, 4(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    movl %eax, 60(%ebp)
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    addl $188, %esp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %esi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %edi
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebx
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    popl %ebp
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    vzeroupper
-; X86-HAVE-SHLD-HAVE-BMI2-AVX-NEXT:    retl
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-LABEL: ashr_64bytes:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1:       # %bb.0:
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    pushl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    subl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups (%eax), %ymm0
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups 32(%eax), %xmm1
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%eax), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%eax), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl (%ecx), %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarl $31, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $60, %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 56(%esp,%ebp), %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shll $3, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    andl $24, %ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 64(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 60(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 72(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 80(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 76(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 88(%esp,%ebp), %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 84(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebx, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 96(%esp,%ebp), %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 92(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 104(%esp,%ebp), %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 48(%esp,%ebp), %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl 108(%esp,%ebp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %ebp, %eax
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 56(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %esi, 48(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edx, 52(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ebx, 40(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 44(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 32(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 36(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 16(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 12(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    sarxl %ecx, (%esp), %eax # 4-byte Folded Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %edi, (%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %ecx, 4(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    movl %eax, 60(%ebp)
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    addl $188, %esp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %esi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %edi
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebx
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    popl %ebp
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    vzeroupper
+; X86-HAVE-SHLD-HAVE-BMI2-AVX1-NEXT:    retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    pushl %esi
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%ecx), %zmm1
+; X86-NO-BMI2-AVX512-NEXT:    movl (%edx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    shrl $3, %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-NO-BMI2-AVX512-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %esi, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2
+; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm1, %zmm2 {%k1}
+; X86-NO-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NO-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; X86-NO-BMI2-AVX512-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; X86-NO-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-NO-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    popl %esi
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    pushl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl (%ecx), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm1, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm0, %zmm2, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    shrl $3, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %edx, %esi, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqa64 %zmm0, %zmm2
+; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm1, %zmm2 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-HAVE-BMI2-AVX512-NEXT:    vpslld $3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,0,63,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpand %xmm3, %xmm1, %xmm4
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsrlq %xmm4, %zmm2, %zmm4
+; X86-HAVE-BMI2-AVX512-NEXT:    valignq {{.*#+}} zmm0 = zmm2[1,2,3,4,5,6,7],zmm0[0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vpandn %xmm3, %xmm1, %xmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm0, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    popl %esi
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %byteOff = load i512, ptr %byteOff.ptr, align 1
   %bitOff = shl i512 %byteOff, 3
@@ -21059,39 +19732,69 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X64-SSE42-NEXT:    popq %rax
 ; X64-SSE42-NEXT:    retq
 ;
-; X64-AVX-LABEL: ashr_64bytes_qwordOff:
-; X64-AVX:       # %bb.0:
-; X64-AVX-NEXT:    pushq %rax
-; X64-AVX-NEXT:    vmovups (%rdi), %ymm0
-; X64-AVX-NEXT:    vmovups 32(%rdi), %xmm1
-; X64-AVX-NEXT:    movq 48(%rdi), %rax
-; X64-AVX-NEXT:    movq 56(%rdi), %rcx
-; X64-AVX-NEXT:    movl (%rsi), %esi
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    sarq $63, %rcx
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT:    andl $7, %esi
-; X64-AVX-NEXT:    vmovups -128(%rsp,%rsi,8), %xmm0
-; X64-AVX-NEXT:    vmovups -112(%rsp,%rsi,8), %xmm1
-; X64-AVX-NEXT:    vmovups -96(%rsp,%rsi,8), %xmm2
-; X64-AVX-NEXT:    vmovups -80(%rsp,%rsi,8), %xmm3
-; X64-AVX-NEXT:    vmovups %xmm3, 48(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm1, 16(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm2, 32(%rdx)
-; X64-AVX-NEXT:    vmovups %xmm0, (%rdx)
-; X64-AVX-NEXT:    popq %rax
-; X64-AVX-NEXT:    vzeroupper
-; X64-AVX-NEXT:    retq
+; X64-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X64-AVX1:       # %bb.0:
+; X64-AVX1-NEXT:    pushq %rax
+; X64-AVX1-NEXT:    vmovups (%rdi), %ymm0
+; X64-AVX1-NEXT:    vmovups 32(%rdi), %xmm1
+; X64-AVX1-NEXT:    movq 48(%rdi), %rax
+; X64-AVX1-NEXT:    movq 56(%rdi), %rcx
+; X64-AVX1-NEXT:    movl (%rsi), %esi
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    sarq $63, %rcx
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-AVX1-NEXT:    andl $7, %esi
+; X64-AVX1-NEXT:    vmovups -128(%rsp,%rsi,8), %xmm0
+; X64-AVX1-NEXT:    vmovups -112(%rsp,%rsi,8), %xmm1
+; X64-AVX1-NEXT:    vmovups -96(%rsp,%rsi,8), %xmm2
+; X64-AVX1-NEXT:    vmovups -80(%rsp,%rsi,8), %xmm3
+; X64-AVX1-NEXT:    vmovups %xmm3, 48(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm1, 16(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm2, 32(%rdx)
+; X64-AVX1-NEXT:    vmovups %xmm0, (%rdx)
+; X64-AVX1-NEXT:    popq %rax
+; X64-AVX1-NEXT:    vzeroupper
+; X64-AVX1-NEXT:    retq
+;
+; X64-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-NO-BMI2-AVX512:       # %bb.0:
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-NO-BMI2-AVX512-NEXT:    movzbl (%rsi), %ecx
+; X64-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-NO-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X64-NO-BMI2-AVX512-NEXT:    movl $-1, %eax
+; X64-NO-BMI2-AVX512-NEXT:    shll %cl, %eax
+; X64-NO-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X64-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%rdx)
+; X64-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X64-NO-BMI2-AVX512-NEXT:    retq
+;
+; X64-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X64-HAVE-BMI2-AVX512:       # %bb.0:
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%rdi), %zmm0
+; X64-HAVE-BMI2-AVX512-NEXT:    movzbl (%rsi), %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,7,7,7,7,7,7,7]
+; X64-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X64-HAVE-BMI2-AVX512-NEXT:    movl $-1, %ecx
+; X64-HAVE-BMI2-AVX512-NEXT:    shlxl %eax, %ecx, %eax
+; X64-HAVE-BMI2-AVX512-NEXT:    kmovw %eax, %k1
+; X64-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X64-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%rdx)
+; X64-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X64-HAVE-BMI2-AVX512-NEXT:    retq
 ;
 ; X86-SSE2-LABEL: ashr_64bytes_qwordOff:
 ; X86-SSE2:       # %bb.0:
@@ -21289,60 +19992,96 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ; X86-SSE42-NEXT:    popl %ebx
 ; X86-SSE42-NEXT:    retl
 ;
-; X86-AVX-LABEL: ashr_64bytes_qwordOff:
-; X86-AVX:       # %bb.0:
-; X86-AVX-NEXT:    pushl %ebx
-; X86-AVX-NEXT:    pushl %edi
-; X86-AVX-NEXT:    pushl %esi
-; X86-AVX-NEXT:    subl $128, %esp
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-AVX-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-AVX-NEXT:    vmovups (%edx), %ymm0
-; X86-AVX-NEXT:    vmovups 32(%edx), %xmm1
-; X86-AVX-NEXT:    movl 48(%edx), %esi
-; X86-AVX-NEXT:    movl 52(%edx), %edi
-; X86-AVX-NEXT:    movl 56(%edx), %ebx
-; X86-AVX-NEXT:    movl 60(%edx), %edx
-; X86-AVX-NEXT:    movl (%ecx), %ecx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    vmovups %ymm0, (%esp)
-; X86-AVX-NEXT:    sarl $31, %edx
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-AVX-NEXT:    andl $7, %ecx
-; X86-AVX-NEXT:    vmovups (%esp,%ecx,8), %xmm0
-; X86-AVX-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
-; X86-AVX-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
-; X86-AVX-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
-; X86-AVX-NEXT:    vmovups %xmm3, 48(%eax)
-; X86-AVX-NEXT:    vmovups %xmm2, 32(%eax)
-; X86-AVX-NEXT:    vmovups %xmm1, 16(%eax)
-; X86-AVX-NEXT:    vmovups %xmm0, (%eax)
-; X86-AVX-NEXT:    addl $128, %esp
-; X86-AVX-NEXT:    popl %esi
-; X86-AVX-NEXT:    popl %edi
-; X86-AVX-NEXT:    popl %ebx
-; X86-AVX-NEXT:    vzeroupper
-; X86-AVX-NEXT:    retl
+; X86-AVX1-LABEL: ashr_64bytes_qwordOff:
+; X86-AVX1:       # %bb.0:
+; X86-AVX1-NEXT:    pushl %ebx
+; X86-AVX1-NEXT:    pushl %edi
+; X86-AVX1-NEXT:    pushl %esi
+; X86-AVX1-NEXT:    subl $128, %esp
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-AVX1-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-AVX1-NEXT:    vmovups (%edx), %ymm0
+; X86-AVX1-NEXT:    vmovups 32(%edx), %xmm1
+; X86-AVX1-NEXT:    movl 48(%edx), %esi
+; X86-AVX1-NEXT:    movl 52(%edx), %edi
+; X86-AVX1-NEXT:    movl 56(%edx), %ebx
+; X86-AVX1-NEXT:    movl 60(%edx), %edx
+; X86-AVX1-NEXT:    movl (%ecx), %ecx
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    vmovups %ymm0, (%esp)
+; X86-AVX1-NEXT:    sarl $31, %edx
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-AVX1-NEXT:    andl $7, %ecx
+; X86-AVX1-NEXT:    vmovups (%esp,%ecx,8), %xmm0
+; X86-AVX1-NEXT:    vmovups 16(%esp,%ecx,8), %xmm1
+; X86-AVX1-NEXT:    vmovups 32(%esp,%ecx,8), %xmm2
+; X86-AVX1-NEXT:    vmovups 48(%esp,%ecx,8), %xmm3
+; X86-AVX1-NEXT:    vmovups %xmm3, 48(%eax)
+; X86-AVX1-NEXT:    vmovups %xmm2, 32(%eax)
+; X86-AVX1-NEXT:    vmovups %xmm1, 16(%eax)
+; X86-AVX1-NEXT:    vmovups %xmm0, (%eax)
+; X86-AVX1-NEXT:    addl $128, %esp
+; X86-AVX1-NEXT:    popl %esi
+; X86-AVX1-NEXT:    popl %edi
+; X86-AVX1-NEXT:    popl %ebx
+; X86-AVX1-NEXT:    vzeroupper
+; X86-AVX1-NEXT:    retl
+;
+; X86-NO-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-NO-BMI2-AVX512:       # %bb.0:
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
+; X86-NO-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-NO-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X86-NO-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-NO-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X86-NO-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-NO-BMI2-AVX512-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-AVX512-NEXT:    kmovw %edx, %k1
+; X86-NO-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X86-NO-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%eax)
+; X86-NO-BMI2-AVX512-NEXT:    vzeroupper
+; X86-NO-BMI2-AVX512-NEXT:    retl
+;
+; X86-HAVE-BMI2-AVX512-LABEL: ashr_64bytes_qwordOff:
+; X86-HAVE-BMI2-AVX512:       # %bb.0:
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 (%edx), %zmm0
+; X86-HAVE-BMI2-AVX512-NEXT:    movzbl (%ecx), %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    vpsraq $63, %zmm0, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpbroadcastq {{.*#+}} zmm2 = [7,0,7,0,7,0,7,0,7,0,7,0,7,0,7,0]
+; X86-HAVE-BMI2-AVX512-NEXT:    vpermq %zmm1, %zmm2, %zmm1
+; X86-HAVE-BMI2-AVX512-NEXT:    movl $-1, %edx
+; X86-HAVE-BMI2-AVX512-NEXT:    shlxl %ecx, %edx, %ecx
+; X86-HAVE-BMI2-AVX512-NEXT:    kmovw %ecx, %k1
+; X86-HAVE-BMI2-AVX512-NEXT:    vpcompressq %zmm0, %zmm1 {%k1}
+; X86-HAVE-BMI2-AVX512-NEXT:    vmovdqu64 %zmm1, (%eax)
+; X86-HAVE-BMI2-AVX512-NEXT:    vzeroupper
+; X86-HAVE-BMI2-AVX512-NEXT:    retl
   %src = load i512, ptr %src.ptr, align 1
   %qwordOff = load i512, ptr %qwordOff.ptr, align 1
   %bitOff = shl i512 %qwordOff, 6
@@ -21354,4 +20093,14 @@ define void @ashr_64bytes_qwordOff(ptr %src.ptr, ptr %qwordOff.ptr, ptr %dst) no
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; ALL: {{.*}}
 ; X64: {{.*}}
+; X64-AVX512: {{.*}}
+; X64-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X64-NO-SHLD-NO-BMI2-AVX512: {{.*}}
 ; X86: {{.*}}
+; X86-AVX512: {{.*}}
+; X86-HAVE-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-HAVE-SHLD-NO-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-HAVE-BMI2-AVX512: {{.*}}
+; X86-NO-SHLD-NO-BMI2-AVX512: {{.*}}



More information about the llvm-commits mailing list