[llvm] [X86] shift-i512.ll - extend test coverage (PR #171125)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 8 05:46:21 PST 2025


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/171125

Remove the v8i64 dependency from the original shift-by-1 tests - this was added for #132601 but is unlikely to be necessary

Add tests for general (variable-amount) shifts, as well as shift-by-constant and shift-of-constant examples
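
For reference, the new coverage takes roughly the following form (a minimal sketch assembled from the tests in the diff below; the shift-of-constant variant is shown in an assumed shape):

  ; general shift - variable amount
  define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
    %r = shl i512 %a0, %a1
    ret i512 %r
  }

  ; shift by a constant amount
  define i512 @lshr_i512_200(i512 %a0) nounwind {
    %r = lshr i512 %a0, 200
    ret i512 %r
  }

  ; shift of a constant value (assumed shape of the shift-of-constant tests)
  define i512 @shl_1_i512(i512 %a0) nounwind {
    %r = shl i512 1, %a0
    ret i512 %r
  }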

>From 31708230fc7e921db5baac047e20313276c1bb40 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Mon, 8 Dec 2025 13:45:31 +0000
Subject: [PATCH] [X86] shift-i512.ll - extend test coverage

Remove v8i64 dependency - this was added for #132601 but is unlikely to be necessary

Add tests for general shifts as well as shift-by-constant and shift-of-constant examples
---
 llvm/test/CodeGen/X86/shift-i512.ll | 2206 ++++++++++++++++++++++++---
 1 file changed, 2024 insertions(+), 182 deletions(-)

diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index 03b61d9235254..4d341f1b31027 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -1,208 +1,2050 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s -check-prefixes=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+avx512vbmi2 | FileCheck %s -check-prefixes=AVX512VBMI
-; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s -check-prefixes=ZNVER4
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64    | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s -check-prefixes=CHECK,SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s -check-prefixes=CHECK,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=knl | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VL
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 -mattr=+avx512vbmi2 | FileCheck %s -check-prefixes=CHECK,AVX512,AVX512VBMI
 
-; i512 shifts hidden inside 512-bit vectors.
+define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: shl_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %eax
+; SSE-NEXT:    andl $56, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    cltq
+; SSE-NEXT:    movq -56(%rsp,%rax), %rdx
+; SSE-NEXT:    movq -48(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %rsi
+; SSE-NEXT:    shldq %cl, %rdx, %rsi
+; SSE-NEXT:    movq -40(%rsp,%rax), %r10
+; SSE-NEXT:    movq %r10, %r8
+; SSE-NEXT:    shldq %cl, %r9, %r8
+; SSE-NEXT:    movq -32(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %r11
+; SSE-NEXT:    shldq %cl, %r10, %r11
+; SSE-NEXT:    movq -24(%rsp,%rax), %r10
+; SSE-NEXT:    movq %r10, %rbx
+; SSE-NEXT:    shldq %cl, %r9, %rbx
+; SSE-NEXT:    movq -16(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %r14
+; SSE-NEXT:    shldq %cl, %r10, %r14
+; SSE-NEXT:    movq -8(%rsp,%rax), %r10
+; SSE-NEXT:    shldq %cl, %r9, %r10
+; SSE-NEXT:    movq -64(%rsp,%rax), %rax
+; SSE-NEXT:    movq %rax, %r9
+; SSE-NEXT:    shlq %cl, %r9
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    shldq %cl, %rax, %rdx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq %r10, 56(%rdi)
+; SSE-NEXT:    movq %r14, 48(%rdi)
+; SSE-NEXT:    movq %rbx, 40(%rdi)
+; SSE-NEXT:    movq %r11, 32(%rdi)
+; SSE-NEXT:    movq %r8, 24(%rdi)
+; SSE-NEXT:    movq %rsi, 16(%rdi)
+; SSE-NEXT:    movq %rdx, 8(%rdi)
+; SSE-NEXT:    movq %r9, (%rdi)
+; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: shl_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %eax
+; AVX2-NEXT:    andl $56, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    movslq %eax, %r8
+; AVX2-NEXT:    movq -56(%rsp,%r8), %rdx
+; AVX2-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    shldq %cl, %rdx, %rsi
+; AVX2-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    shldq %cl, %rax, %r9
+; AVX2-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    shldq %cl, %r10, %r11
+; AVX2-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX2-NEXT:    movq %r10, %rbx
+; AVX2-NEXT:    shldq %cl, %rax, %rbx
+; AVX2-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    shldq %cl, %r10, %r14
+; AVX2-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX2-NEXT:    shldq %cl, %rax, %r10
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT:    shlxq %rcx, %rdi, %r8
+; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT:    shldq %cl, %rdi, %rdx
+; AVX2-NEXT:    movq %r10, 56(%rax)
+; AVX2-NEXT:    movq %r14, 48(%rax)
+; AVX2-NEXT:    movq %rbx, 40(%rax)
+; AVX2-NEXT:    movq %r11, 32(%rax)
+; AVX2-NEXT:    movq %r9, 24(%rax)
+; AVX2-NEXT:    movq %rsi, 16(%rax)
+; AVX2-NEXT:    movq %rdx, 8(%rax)
+; AVX2-NEXT:    movq %r8, (%rax)
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: shl_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %eax, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %eax
+; AVX512F-NEXT:    andl $56, %eax
+; AVX512F-NEXT:    negl %eax
+; AVX512F-NEXT:    movslq %eax, %r8
+; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
+; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %rsi
+; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    shldq %cl, %rax, %r9
+; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    shldq %cl, %r10, %r11
+; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq %r10, %rbx
+; AVX512F-NEXT:    shldq %cl, %rax, %rbx
+; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    shldq %cl, %r10, %r14
+; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT:    shldq %cl, %rax, %r10
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
+; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
+; AVX512F-NEXT:    movq %r10, 56(%rax)
+; AVX512F-NEXT:    movq %r14, 48(%rax)
+; AVX512F-NEXT:    movq %rbx, 40(%rax)
+; AVX512F-NEXT:    movq %r11, 32(%rax)
+; AVX512F-NEXT:    movq %r9, 24(%rax)
+; AVX512F-NEXT:    movq %rsi, 16(%rax)
+; AVX512F-NEXT:    movq %rdx, 8(%rax)
+; AVX512F-NEXT:    movq %r8, (%rax)
+; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shl_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %eax
+; AVX512VL-NEXT:    andl $56, %eax
+; AVX512VL-NEXT:    negl %eax
+; AVX512VL-NEXT:    movslq %eax, %r9
+; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
+; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT:    movq %rax, %rsi
+; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT:    movq %r10, %r8
+; AVX512VL-NEXT:    shldq %cl, %rax, %r8
+; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq %r11, %rbx
+; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq %rdi, %r10
+; AVX512VL-NEXT:    shldq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
+; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
+; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
+; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
+; AVX512VL-NEXT:    movq %rdi, 56(%rax)
+; AVX512VL-NEXT:    movq %r15, 48(%rax)
+; AVX512VL-NEXT:    movq %r10, 40(%rax)
+; AVX512VL-NEXT:    movq %rbx, 32(%rax)
+; AVX512VL-NEXT:    movq %r8, 24(%rax)
+; AVX512VL-NEXT:    movq %rsi, 16(%rax)
+; AVX512VL-NEXT:    movq %rdx, 8(%rax)
+; AVX512VL-NEXT:    movq %r9, (%rax)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shl_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %eax, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %eax
+; AVX512VBMI-NEXT:    andl $56, %eax
+; AVX512VBMI-NEXT:    negl %eax
+; AVX512VBMI-NEXT:    movslq %eax, %r9
+; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT:    movq %rax, %rsi
+; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r8
+; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
+; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq %r11, %rbx
+; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq %rdi, %r10
+; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
+; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
+; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
+; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
+; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT:    movq %r8, 24(%rax)
+; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %r = shl i512 %a0, %a1
+  ret i512 %r
+}
 
-define <8 x i64> @shl_i512_1(<8 x i64> %a)  {
-; AVX512VL-LABEL: shl_i512_1:
+define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: lshr_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    xorps %xmm1, %xmm1
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %eax
+; SSE-NEXT:    andl $56, %eax
+; SSE-NEXT:    movq -112(%rsp,%rax), %rdx
+; SSE-NEXT:    movq -120(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %rsi
+; SSE-NEXT:    shrdq %cl, %rdx, %rsi
+; SSE-NEXT:    movq -104(%rsp,%rax), %r8
+; SSE-NEXT:    shrdq %cl, %r8, %rdx
+; SSE-NEXT:    movq -96(%rsp,%rax), %r10
+; SSE-NEXT:    shrdq %cl, %r10, %r8
+; SSE-NEXT:    movq -88(%rsp,%rax), %r11
+; SSE-NEXT:    shrdq %cl, %r11, %r10
+; SSE-NEXT:    movq -80(%rsp,%rax), %rbx
+; SSE-NEXT:    shrdq %cl, %rbx, %r11
+; SSE-NEXT:    movq -72(%rsp,%rax), %r14
+; SSE-NEXT:    shrdq %cl, %r14, %rbx
+; SSE-NEXT:    movq -128(%rsp,%rax), %r15
+; SSE-NEXT:    shrdq %cl, %r9, %r15
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    shrq %cl, %r14
+; SSE-NEXT:    movq %r14, 56(%rdi)
+; SSE-NEXT:    movq %rbx, 48(%rdi)
+; SSE-NEXT:    movq %r11, 40(%rdi)
+; SSE-NEXT:    movq %r10, 32(%rdi)
+; SSE-NEXT:    movq %r8, 24(%rdi)
+; SSE-NEXT:    movq %rdx, 16(%rdi)
+; SSE-NEXT:    movq %rsi, 8(%rdi)
+; SSE-NEXT:    movq %r15, (%rdi)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: lshr_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %eax
+; AVX2-NEXT:    andl $56, %eax
+; AVX2-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX2-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX2-NEXT:    movq %r9, %rsi
+; AVX2-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX2-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX2-NEXT:    shrdq %cl, %r8, %rdx
+; AVX2-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX2-NEXT:    shrdq %cl, %r10, %r8
+; AVX2-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX2-NEXT:    shrdq %cl, %r11, %r10
+; AVX2-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT:    shrdq %cl, %rbx, %r11
+; AVX2-NEXT:    movq -128(%rsp,%rax), %r14
+; AVX2-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX2-NEXT:    shrdq %cl, %r15, %rbx
+; AVX2-NEXT:    shrdq %cl, %r9, %r14
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shrxq %rcx, %r15, %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rdi)
+; AVX2-NEXT:    movq %rbx, 48(%rdi)
+; AVX2-NEXT:    movq %r11, 40(%rdi)
+; AVX2-NEXT:    movq %r10, 32(%rdi)
+; AVX2-NEXT:    movq %r8, 24(%rdi)
+; AVX2-NEXT:    movq %rdx, 16(%rdi)
+; AVX2-NEXT:    movq %rsi, 8(%rdi)
+; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: lshr_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %eax, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %eax
+; AVX512F-NEXT:    andl $56, %eax
+; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT:    movq %r9, %rsi
+; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT:    shrdq %cl, %r10, %r8
+; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT:    shrdq %cl, %r11, %r10
+; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
+; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
+; AVX512F-NEXT:    shrdq %cl, %r9, %r14
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    shrxq %rcx, %r15, %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rdi)
+; AVX512F-NEXT:    movq %rbx, 48(%rdi)
+; AVX512F-NEXT:    movq %r11, 40(%rdi)
+; AVX512F-NEXT:    movq %r10, 32(%rdi)
+; AVX512F-NEXT:    movq %r8, 24(%rdi)
+; AVX512F-NEXT:    movq %rdx, 16(%rdi)
+; AVX512F-NEXT:    movq %rsi, 8(%rdi)
+; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: lshr_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm0[3,4,5,6,7,0,1,2]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vpaddq %xmm0, %xmm0, %xmm3
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX512VL-NEXT:    vpsrlq $63, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpaddq %xmm2, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpor %xmm4, %xmm2, %xmm2
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512VL-NEXT:    vpaddq %ymm3, %ymm3, %ymm3
-; AVX512VL-NEXT:    vpsrlq $63, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpor %ymm1, %ymm3, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
-; AVX512VL-NEXT:    vpsrlq $63, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT:    vpaddq %zmm0, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %eax
+; AVX512VL-NEXT:    andl $56, %eax
+; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT:    movq %r9, %rsi
+; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
+; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
+; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VL-NEXT:    movq %r11, 40(%rdi)
+; AVX512VL-NEXT:    movq %r10, 32(%rdi)
+; AVX512VL-NEXT:    movq %r8, 24(%rdi)
+; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
+; AVX512VL-NEXT:    movq %r15, (%rdi)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
-; AVX512VBMI-LABEL: shl_i512_1:
+; AVX512VBMI-LABEL: lshr_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT:    vpshldq $1, %xmm3, %xmm2, %xmm3
-; AVX512VBMI-NEXT:    vpaddq %xmm0, %xmm0, %xmm4
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; AVX512VBMI-NEXT:    vpshldq $1, %ymm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT:    vpshldq $1, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %eax, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %eax
+; AVX512VBMI-NEXT:    andl $56, %eax
+; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT:    movq %r9, %rsi
+; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
+; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT:    movq %r8, 24(%rdi)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
+; AVX512VBMI-NEXT:    movq %r15, (%rdi)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
+  %r = lshr i512 %a0, %a1
+  ret i512 %r
+}
+
+define i512 @ashr_i512(i512 %a0, i512 %a1) nounwind {
+; SSE-LABEL: ashr_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    sarq $63, %r10
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %eax, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %eax
+; SSE-NEXT:    andl $56, %eax
+; SSE-NEXT:    movq -112(%rsp,%rax), %rdx
+; SSE-NEXT:    movq -120(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %rsi
+; SSE-NEXT:    shrdq %cl, %rdx, %rsi
+; SSE-NEXT:    movq -104(%rsp,%rax), %r8
+; SSE-NEXT:    shrdq %cl, %r8, %rdx
+; SSE-NEXT:    movq -96(%rsp,%rax), %r10
+; SSE-NEXT:    shrdq %cl, %r10, %r8
+; SSE-NEXT:    movq -88(%rsp,%rax), %r11
+; SSE-NEXT:    shrdq %cl, %r11, %r10
+; SSE-NEXT:    movq -80(%rsp,%rax), %rbx
+; SSE-NEXT:    shrdq %cl, %rbx, %r11
+; SSE-NEXT:    movq -72(%rsp,%rax), %r14
+; SSE-NEXT:    shrdq %cl, %r14, %rbx
+; SSE-NEXT:    movq -128(%rsp,%rax), %r15
+; SSE-NEXT:    shrdq %cl, %r9, %r15
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    sarq %cl, %r14
+; SSE-NEXT:    movq %r14, 56(%rdi)
+; SSE-NEXT:    movq %rbx, 48(%rdi)
+; SSE-NEXT:    movq %r11, 40(%rdi)
+; SSE-NEXT:    movq %r10, 32(%rdi)
+; SSE-NEXT:    movq %r8, 24(%rdi)
+; SSE-NEXT:    movq %rdx, 16(%rdi)
+; SSE-NEXT:    movq %rsi, 8(%rdi)
+; SSE-NEXT:    movq %r15, (%rdi)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: ashr_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    sarq $63, %r10
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %eax
+; AVX2-NEXT:    andl $56, %eax
+; AVX2-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX2-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX2-NEXT:    movq %r9, %rsi
+; AVX2-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX2-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX2-NEXT:    shrdq %cl, %r8, %rdx
+; AVX2-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX2-NEXT:    shrdq %cl, %r10, %r8
+; AVX2-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX2-NEXT:    shrdq %cl, %r11, %r10
+; AVX2-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX2-NEXT:    shrdq %cl, %rbx, %r11
+; AVX2-NEXT:    movq -128(%rsp,%rax), %r14
+; AVX2-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX2-NEXT:    shrdq %cl, %r15, %rbx
+; AVX2-NEXT:    shrdq %cl, %r9, %r14
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    sarxq %rcx, %r15, %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rdi)
+; AVX2-NEXT:    movq %rbx, 48(%rdi)
+; AVX2-NEXT:    movq %r11, 40(%rdi)
+; AVX2-NEXT:    movq %r10, 32(%rdi)
+; AVX2-NEXT:    movq %r8, 24(%rdi)
+; AVX2-NEXT:    movq %rdx, 16(%rdi)
+; AVX2-NEXT:    movq %rsi, 8(%rdi)
+; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: ashr_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    sarq $63, %r10
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %eax, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %eax
+; AVX512F-NEXT:    andl $56, %eax
+; AVX512F-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512F-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512F-NEXT:    movq %r9, %rsi
+; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512F-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512F-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512F-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512F-NEXT:    shrdq %cl, %r10, %r8
+; AVX512F-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512F-NEXT:    shrdq %cl, %r11, %r10
+; AVX512F-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512F-NEXT:    movq -128(%rsp,%rax), %r14
+; AVX512F-NEXT:    movq -72(%rsp,%rax), %r15
+; AVX512F-NEXT:    shrdq %cl, %r15, %rbx
+; AVX512F-NEXT:    shrdq %cl, %r9, %r14
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    sarxq %rcx, %r15, %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rdi)
+; AVX512F-NEXT:    movq %rbx, 48(%rdi)
+; AVX512F-NEXT:    movq %r11, 40(%rdi)
+; AVX512F-NEXT:    movq %r10, 32(%rdi)
+; AVX512F-NEXT:    movq %r8, 24(%rdi)
+; AVX512F-NEXT:    movq %rdx, 16(%rdi)
+; AVX512F-NEXT:    movq %rsi, 8(%rdi)
+; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: ashr_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VL-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    sarq $63, %r10
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %eax, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %eax
+; AVX512VL-NEXT:    andl $56, %eax
+; AVX512VL-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512VL-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VL-NEXT:    movq %r9, %rsi
+; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512VL-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512VL-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VL-NEXT:    shrdq %cl, %r10, %r8
+; AVX512VL-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VL-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
+; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VL-NEXT:    movq %r11, 40(%rdi)
+; AVX512VL-NEXT:    movq %r10, 32(%rdi)
+; AVX512VL-NEXT:    movq %r8, 24(%rdi)
+; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
+; AVX512VL-NEXT:    movq %r15, (%rdi)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    retq
 ;
-; ZNVER4-LABEL: shl_i512_1:
-; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
-; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,2,3]
-; ZNVER4-NEXT:    vpaddq %xmm0, %xmm0, %xmm4
-; ZNVER4-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vpshldq $1, %xmm3, %xmm2, %xmm3
-; ZNVER4-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
-; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT:    vpshldq $1, %ymm1, %ymm2, %ymm1
-; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT:    vpshldq $1, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm1[0],zmm0[0],zmm1[2],zmm0[2],zmm1[4],zmm0[4],zmm1[6],zmm0[6]
-; ZNVER4-NEXT:    retq
-  %d = bitcast <8 x i64> %a to i512
-  %s = shl i512 %d, 1
-  %r = bitcast i512 %s to <8 x i64>
-  ret <8 x i64> %r
+; AVX512VBMI-LABEL: ashr_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    sarq $63, %r10
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %eax, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %eax
+; AVX512VBMI-NEXT:    andl $56, %eax
+; AVX512VBMI-NEXT:    movq -112(%rsp,%rax), %rdx
+; AVX512VBMI-NEXT:    movq -120(%rsp,%rax), %r9
+; AVX512VBMI-NEXT:    movq %r9, %rsi
+; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT:    movq -104(%rsp,%rax), %r8
+; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
+; AVX512VBMI-NEXT:    movq -96(%rsp,%rax), %r10
+; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r8
+; AVX512VBMI-NEXT:    movq -88(%rsp,%rax), %r11
+; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -80(%rsp,%rax), %rbx
+; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT:    movq -72(%rsp,%rax), %r14
+; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT:    movq -128(%rsp,%rax), %r15
+; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r15
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT:    movq %r8, 24(%rdi)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
+; AVX512VBMI-NEXT:    movq %r15, (%rdi)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    retq
+  %r = ashr i512 %a0, %a1
+  ret i512 %r
+}
+
+define i512 @shl_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: shl_i512_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT:    shldq $1, %rdi, %r10
+; CHECK-NEXT:    shldq $1, %r11, %rdi
+; CHECK-NEXT:    shldq $1, %r9, %r11
+; CHECK-NEXT:    shldq $1, %r8, %r9
+; CHECK-NEXT:    shldq $1, %rcx, %r8
+; CHECK-NEXT:    shldq $1, %rdx, %rcx
+; CHECK-NEXT:    shldq $1, %rsi, %rdx
+; CHECK-NEXT:    addq %rsi, %rsi
+; CHECK-NEXT:    movq %r10, 56(%rax)
+; CHECK-NEXT:    movq %rdi, 48(%rax)
+; CHECK-NEXT:    movq %r11, 40(%rax)
+; CHECK-NEXT:    movq %r9, 32(%rax)
+; CHECK-NEXT:    movq %r8, 24(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %rdx, 8(%rax)
+; CHECK-NEXT:    movq %rsi, (%rax)
+; CHECK-NEXT:    retq
+  %r = shl i512 %a0, 1
+  ret i512 %r
+}
+
+define i512 @lshr_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: lshr_i512_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT:    shrdq $1, %rdx, %rsi
+; CHECK-NEXT:    shrdq $1, %rcx, %rdx
+; CHECK-NEXT:    shrdq $1, %r8, %rcx
+; CHECK-NEXT:    shrdq $1, %r9, %r8
+; CHECK-NEXT:    shrdq $1, %r11, %r9
+; CHECK-NEXT:    shrdq $1, %rdi, %r11
+; CHECK-NEXT:    shrdq $1, %r10, %rdi
+; CHECK-NEXT:    shrq %r10
+; CHECK-NEXT:    movq %r10, 56(%rax)
+; CHECK-NEXT:    movq %rdi, 48(%rax)
+; CHECK-NEXT:    movq %r11, 40(%rax)
+; CHECK-NEXT:    movq %r9, 32(%rax)
+; CHECK-NEXT:    movq %r8, 24(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %rdx, 8(%rax)
+; CHECK-NEXT:    movq %rsi, (%rax)
+; CHECK-NEXT:    retq
+  %r = lshr i512 %a0, 1
+  ret i512 %r
+}
+
+define i512 @ashr_i512_1(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; CHECK-NEXT:    shrdq $1, %rdx, %rsi
+; CHECK-NEXT:    shrdq $1, %rcx, %rdx
+; CHECK-NEXT:    shrdq $1, %r8, %rcx
+; CHECK-NEXT:    shrdq $1, %r9, %r8
+; CHECK-NEXT:    shrdq $1, %r11, %r9
+; CHECK-NEXT:    shrdq $1, %rdi, %r11
+; CHECK-NEXT:    shrdq $1, %r10, %rdi
+; CHECK-NEXT:    sarq %r10
+; CHECK-NEXT:    movq %r10, 56(%rax)
+; CHECK-NEXT:    movq %rdi, 48(%rax)
+; CHECK-NEXT:    movq %r11, 40(%rax)
+; CHECK-NEXT:    movq %r9, 32(%rax)
+; CHECK-NEXT:    movq %r8, 24(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %rdx, 8(%rax)
+; CHECK-NEXT:    movq %rsi, (%rax)
+; CHECK-NEXT:    retq
+  %r = ashr i512 %a0, 1
+  ret i512 %r
+}
+
+define i512 @shl_i512_200(i512 %a0) nounwind {
+; SSE-LABEL: shl_i512_200:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    shldq $8, %r8, %r9
+; SSE-NEXT:    shldq $8, %rcx, %r8
+; SSE-NEXT:    shldq $8, %rdx, %rcx
+; SSE-NEXT:    shldq $8, %rsi, %rdx
+; SSE-NEXT:    shlq $8, %rsi
+; SSE-NEXT:    movq %r9, 56(%rdi)
+; SSE-NEXT:    movq %r8, 48(%rdi)
+; SSE-NEXT:    movq %rcx, 40(%rdi)
+; SSE-NEXT:    movq %rdx, 32(%rdi)
+; SSE-NEXT:    movq %rsi, 24(%rdi)
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, (%rdi)
+; SSE-NEXT:    movq $0, 16(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: shl_i512_200:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shldq $8, %r8, %r9
+; AVX2-NEXT:    shldq $8, %rcx, %r8
+; AVX2-NEXT:    shldq $8, %rdx, %rcx
+; AVX2-NEXT:    shldq $8, %rsi, %rdx
+; AVX2-NEXT:    shlq $8, %rsi
+; AVX2-NEXT:    movq %r9, 56(%rdi)
+; AVX2-NEXT:    movq %r8, 48(%rdi)
+; AVX2-NEXT:    movq %rcx, 40(%rdi)
+; AVX2-NEXT:    movq %rdx, 32(%rdi)
+; AVX2-NEXT:    movq %rsi, 24(%rdi)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX2-NEXT:    movq $0, 16(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: shl_i512_200:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    shldq $8, %r8, %r9
+; AVX512-NEXT:    shldq $8, %rcx, %r8
+; AVX512-NEXT:    shldq $8, %rdx, %rcx
+; AVX512-NEXT:    shldq $8, %rsi, %rdx
+; AVX512-NEXT:    shlq $8, %rsi
+; AVX512-NEXT:    movq %r9, 56(%rdi)
+; AVX512-NEXT:    movq %r8, 48(%rdi)
+; AVX512-NEXT:    movq %rcx, 40(%rdi)
+; AVX512-NEXT:    movq %rdx, 32(%rdi)
+; AVX512-NEXT:    movq %rsi, 24(%rdi)
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
+; AVX512-NEXT:    movq $0, 16(%rdi)
+; AVX512-NEXT:    retq
+  %r = shl i512 %a0, 200
+  ret i512 %r
+}
+
+define i512 @lshr_i512_200(i512 %a0) nounwind {
+; SSE-LABEL: lshr_i512_200:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT:    shrdq $8, %r9, %r8
+; SSE-NEXT:    shrdq $8, %rsi, %r9
+; SSE-NEXT:    shrdq $8, %rcx, %rsi
+; SSE-NEXT:    shrdq $8, %rdx, %rcx
+; SSE-NEXT:    shrq $8, %rdx
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movups %xmm0, 40(%rdi)
+; SSE-NEXT:    movq %rdx, 32(%rdi)
+; SSE-NEXT:    movq %rcx, 24(%rdi)
+; SSE-NEXT:    movq %rsi, 16(%rdi)
+; SSE-NEXT:    movq %r9, 8(%rdi)
+; SSE-NEXT:    movq %r8, (%rdi)
+; SSE-NEXT:    movq $0, 56(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: lshr_i512_200:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT:    shrdq $8, %r9, %r8
+; AVX2-NEXT:    shrdq $8, %rsi, %r9
+; AVX2-NEXT:    shrdq $8, %rcx, %rsi
+; AVX2-NEXT:    shrdq $8, %rdx, %rcx
+; AVX2-NEXT:    shrq $8, %rdx
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX2-NEXT:    movq %rdx, 32(%rdi)
+; AVX2-NEXT:    movq %rcx, 24(%rdi)
+; AVX2-NEXT:    movq %rsi, 16(%rdi)
+; AVX2-NEXT:    movq %r9, 8(%rdi)
+; AVX2-NEXT:    movq %r8, (%rdi)
+; AVX2-NEXT:    movq $0, 56(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: lshr_i512_200:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT:    shrdq $8, %r9, %r8
+; AVX512-NEXT:    shrdq $8, %rsi, %r9
+; AVX512-NEXT:    shrdq $8, %rcx, %rsi
+; AVX512-NEXT:    shrdq $8, %rdx, %rcx
+; AVX512-NEXT:    shrq $8, %rdx
+; AVX512-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX512-NEXT:    movq %rdx, 32(%rdi)
+; AVX512-NEXT:    movq %rcx, 24(%rdi)
+; AVX512-NEXT:    movq %rsi, 16(%rdi)
+; AVX512-NEXT:    movq %r9, 8(%rdi)
+; AVX512-NEXT:    movq %r8, (%rdi)
+; AVX512-NEXT:    movq $0, 56(%rdi)
+; AVX512-NEXT:    retq
+  %r = lshr i512 %a0, 200
+  ret i512 %r
+}
+
+define i512 @ashr_i512_200(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_200:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    shrdq $8, %r9, %r8
+; CHECK-NEXT:    shrdq $8, %rsi, %r9
+; CHECK-NEXT:    shrdq $8, %rcx, %rsi
+; CHECK-NEXT:    shrdq $8, %rdx, %rcx
+; CHECK-NEXT:    movq %rdx, %rdi
+; CHECK-NEXT:    sarq $8, %rdi
+; CHECK-NEXT:    sarq $63, %rdx
+; CHECK-NEXT:    movq %rdx, 56(%rax)
+; CHECK-NEXT:    movq %rdx, 48(%rax)
+; CHECK-NEXT:    movq %rdx, 40(%rax)
+; CHECK-NEXT:    movq %rdi, 32(%rax)
+; CHECK-NEXT:    movq %rcx, 24(%rax)
+; CHECK-NEXT:    movq %rsi, 16(%rax)
+; CHECK-NEXT:    movq %r9, 8(%rax)
+; CHECK-NEXT:    movq %r8, (%rax)
+; CHECK-NEXT:    retq
+  %r = ashr i512 %a0, 200
+  ret i512 %r
 }
 
-define <8 x i64> @lshr_i512_1(<8 x i64> %a)  {
-; AVX512VL-LABEL: lshr_i512_1:
+define i512 @shl_i512_511(i512 %a0) nounwind {
+; SSE-LABEL: shl_i512_511:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    shlq $63, %rsi
+; SSE-NEXT:    movq %rsi, 56(%rdi)
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, 32(%rdi)
+; SSE-NEXT:    movaps %xmm0, 16(%rdi)
+; SSE-NEXT:    movaps %xmm0, (%rdi)
+; SSE-NEXT:    movq $0, 48(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: shl_i512_511:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shlq $63, %rsi
+; AVX2-NEXT:    movq %rsi, 56(%rdi)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovaps %xmm0, 32(%rdi)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, (%rdi)
+; AVX2-NEXT:    movq $0, 48(%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: shl_i512_511:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    shlq $63, %rsi
+; AVX512F-NEXT:    movq %rsi, 56(%rdi)
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovaps %xmm0, 32(%rdi)
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512F-NEXT:    movq $0, 48(%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shl_i512_511:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    shlq $63, %rsi
+; AVX512VL-NEXT:    movq %rsi, 56(%rdi)
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovaps %xmm0, 32(%rdi)
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512VL-NEXT:    movq $0, 48(%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shl_i512_511:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    shlq $63, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, 56(%rdi)
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovaps %xmm0, 32(%rdi)
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, (%rdi)
+; AVX512VBMI-NEXT:    movq $0, 48(%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %r = shl i512 %a0, 511
+  ret i512 %r
+}
+
+define i512 @lshr_i512_511(i512 %a0) nounwind {
+; SSE-LABEL: lshr_i512_511:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    shrq $63, %rcx
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movups %xmm0, 40(%rdi)
+; SSE-NEXT:    movups %xmm0, 24(%rdi)
+; SSE-NEXT:    movups %xmm0, 8(%rdi)
+; SSE-NEXT:    movq %rcx, (%rdi)
+; SSE-NEXT:    movq $0, 56(%rdi)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: lshr_i512_511:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    shrq $63, %rcx
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX2-NEXT:    movq %rcx, (%rdi)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, 8(%rdi)
+; AVX2-NEXT:    movq $0, 56(%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: lshr_i512_511:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    shrq $63, %rcx
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX512F-NEXT:    movq %rcx, (%rdi)
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %ymm0, 8(%rdi)
+; AVX512F-NEXT:    movq $0, 56(%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: lshr_i512_511:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT:    vpsllq $63, %xmm3, %xmm4
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512VL-NEXT:    vpsrlq $1, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX512VL-NEXT:    vpsrlq $1, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsllq $63, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    vpsrlq $1, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpsrlq $1, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX512VL-NEXT:    shrq $63, %rcx
+; AVX512VL-NEXT:    movq %rcx, (%rdi)
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, 8(%rdi)
+; AVX512VL-NEXT:    movq $0, 56(%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
-; AVX512VBMI-LABEL: lshr_i512_1:
+; AVX512VBMI-LABEL: lshr_i512_511:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX512VBMI-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT:    vpshldq $63, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %xmm0, 40(%rdi)
+; AVX512VBMI-NEXT:    shrq $63, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, (%rdi)
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, 8(%rdi)
+; AVX512VBMI-NEXT:    movq $0, 56(%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
+  %r = lshr i512 %a0, 511
+  ret i512 %r
+}
+
+define i512 @ashr_i512_511(i512 %a0) nounwind {
+; CHECK-LABEL: ashr_i512_511:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    sarq $63, %rcx
+; CHECK-NEXT:    movq %rcx, 56(%rdi)
+; CHECK-NEXT:    movq %rcx, 48(%rdi)
+; CHECK-NEXT:    movq %rcx, 40(%rdi)
+; CHECK-NEXT:    movq %rcx, 32(%rdi)
+; CHECK-NEXT:    movq %rcx, 24(%rdi)
+; CHECK-NEXT:    movq %rcx, 16(%rdi)
+; CHECK-NEXT:    movq %rcx, 8(%rdi)
+; CHECK-NEXT:    movq %rcx, (%rdi)
+; CHECK-NEXT:    retq
+  %r = ashr i512 %a0, 511
+  ret i512 %r
+}
+
+define i512 @shl_1_i512(i512 %a0) nounwind {
+; SSE-LABEL: shl_1_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movups %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %esi
+; SSE-NEXT:    andl $56, %esi
+; SSE-NEXT:    negl %esi
+; SSE-NEXT:    movslq %esi, %rax
+; SSE-NEXT:    movq -56(%rsp,%rax), %rdx
+; SSE-NEXT:    movq -48(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %rsi
+; SSE-NEXT:    shldq %cl, %rdx, %rsi
+; SSE-NEXT:    movq -40(%rsp,%rax), %r10
+; SSE-NEXT:    movq %r10, %r8
+; SSE-NEXT:    shldq %cl, %r9, %r8
+; SSE-NEXT:    movq -32(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %r11
+; SSE-NEXT:    shldq %cl, %r10, %r11
+; SSE-NEXT:    movq -24(%rsp,%rax), %r10
+; SSE-NEXT:    movq %r10, %rbx
+; SSE-NEXT:    shldq %cl, %r9, %rbx
+; SSE-NEXT:    movq -16(%rsp,%rax), %r9
+; SSE-NEXT:    movq %r9, %r14
+; SSE-NEXT:    shldq %cl, %r10, %r14
+; SSE-NEXT:    movq -8(%rsp,%rax), %r10
+; SSE-NEXT:    shldq %cl, %r9, %r10
+; SSE-NEXT:    movq -64(%rsp,%rax), %rax
+; SSE-NEXT:    movq %rax, %r9
+; SSE-NEXT:    shlq %cl, %r9
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    shldq %cl, %rax, %rdx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq %r10, 56(%rdi)
+; SSE-NEXT:    movq %r14, 48(%rdi)
+; SSE-NEXT:    movq %rbx, 40(%rdi)
+; SSE-NEXT:    movq %r11, 32(%rdi)
+; SSE-NEXT:    movq %r8, 24(%rdi)
+; SSE-NEXT:    movq %rsi, 16(%rdi)
+; SSE-NEXT:    movq %rdx, 8(%rdi)
+; SSE-NEXT:    movq %r9, (%rdi)
+; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: shl_1_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %esi, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %esi
+; AVX2-NEXT:    andl $56, %esi
+; AVX2-NEXT:    negl %esi
+; AVX2-NEXT:    movslq %esi, %r8
+; AVX2-NEXT:    movq -56(%rsp,%r8), %rdx
+; AVX2-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    shldq %cl, %rdx, %rsi
+; AVX2-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    shldq %cl, %rax, %r9
+; AVX2-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %r11
+; AVX2-NEXT:    shldq %cl, %r10, %r11
+; AVX2-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX2-NEXT:    movq %r10, %rbx
+; AVX2-NEXT:    shldq %cl, %rax, %rbx
+; AVX2-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    shldq %cl, %r10, %r14
+; AVX2-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX2-NEXT:    shldq %cl, %rax, %r10
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX2-NEXT:    shlxq %rcx, %rdi, %r8
+; AVX2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX2-NEXT:    shldq %cl, %rdi, %rdx
+; AVX2-NEXT:    movq %r10, 56(%rax)
+; AVX2-NEXT:    movq %r14, 48(%rax)
+; AVX2-NEXT:    movq %rbx, 40(%rax)
+; AVX2-NEXT:    movq %r11, 32(%rax)
+; AVX2-NEXT:    movq %r9, 24(%rax)
+; AVX2-NEXT:    movq %rsi, 16(%rax)
+; AVX2-NEXT:    movq %rdx, 8(%rax)
+; AVX2-NEXT:    movq %r8, (%rax)
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
-; ZNVER4-LABEL: lshr_i512_1:
-; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
-; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT:    vpsrlq $1, %xmm2, %xmm2
-; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; ZNVER4-NEXT:    retq
-  %d = bitcast <8 x i64> %a to i512
-  %s = lshr i512 %d, 1
-  %r = bitcast i512 %s to <8 x i64>
-  ret <8 x i64> %r
+; AVX512F-LABEL: shl_1_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovss {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %esi, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %esi
+; AVX512F-NEXT:    andl $56, %esi
+; AVX512F-NEXT:    negl %esi
+; AVX512F-NEXT:    movslq %esi, %r8
+; AVX512F-NEXT:    movq -56(%rsp,%r8), %rdx
+; AVX512F-NEXT:    movq -48(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %rsi
+; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512F-NEXT:    movq -40(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    shldq %cl, %rax, %r9
+; AVX512F-NEXT:    movq -32(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %r11
+; AVX512F-NEXT:    shldq %cl, %r10, %r11
+; AVX512F-NEXT:    movq -24(%rsp,%r8), %r10
+; AVX512F-NEXT:    movq %r10, %rbx
+; AVX512F-NEXT:    shldq %cl, %rax, %rbx
+; AVX512F-NEXT:    movq -16(%rsp,%r8), %rax
+; AVX512F-NEXT:    movq %rax, %r14
+; AVX512F-NEXT:    shldq %cl, %r10, %r14
+; AVX512F-NEXT:    movq -8(%rsp,%r8), %r10
+; AVX512F-NEXT:    shldq %cl, %rax, %r10
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movq -64(%rsp,%r8), %rdi
+; AVX512F-NEXT:    shlxq %rcx, %rdi, %r8
+; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
+; AVX512F-NEXT:    movq %r10, 56(%rax)
+; AVX512F-NEXT:    movq %r14, 48(%rax)
+; AVX512F-NEXT:    movq %rbx, 40(%rax)
+; AVX512F-NEXT:    movq %r11, 32(%rax)
+; AVX512F-NEXT:    movq %r9, 24(%rax)
+; AVX512F-NEXT:    movq %rsi, 16(%rax)
+; AVX512F-NEXT:    movq %rdx, 8(%rax)
+; AVX512F-NEXT:    movq %r8, (%rax)
+; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: shl_1_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %esi
+; AVX512VL-NEXT:    andl $56, %esi
+; AVX512VL-NEXT:    negl %esi
+; AVX512VL-NEXT:    movslq %esi, %r9
+; AVX512VL-NEXT:    movq -56(%rsp,%r9), %rdx
+; AVX512VL-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VL-NEXT:    movq %rax, %rsi
+; AVX512VL-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512VL-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VL-NEXT:    movq %r10, %r8
+; AVX512VL-NEXT:    shldq %cl, %rax, %r8
+; AVX512VL-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq %r11, %rbx
+; AVX512VL-NEXT:    shldq %cl, %r10, %rbx
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    movq %rdi, %r10
+; AVX512VL-NEXT:    shldq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -64(%rsp,%r9), %r11
+; AVX512VL-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    shldq %cl, %rdi, %r15
+; AVX512VL-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VL-NEXT:    shldq %cl, %r14, %rdi
+; AVX512VL-NEXT:    shlxq %rcx, %r11, %r9
+; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
+; AVX512VL-NEXT:    movq %rdi, 56(%rax)
+; AVX512VL-NEXT:    movq %r15, 48(%rax)
+; AVX512VL-NEXT:    movq %r10, 40(%rax)
+; AVX512VL-NEXT:    movq %rbx, 32(%rax)
+; AVX512VL-NEXT:    movq %r8, 24(%rax)
+; AVX512VL-NEXT:    movq %rsi, 16(%rax)
+; AVX512VL-NEXT:    movq %rdx, 8(%rax)
+; AVX512VL-NEXT:    movq %r9, (%rax)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: shl_1_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps {{.*#+}} xmm0 = [1,0,0,0]
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %esi
+; AVX512VBMI-NEXT:    andl $56, %esi
+; AVX512VBMI-NEXT:    negl %esi
+; AVX512VBMI-NEXT:    movslq %esi, %r9
+; AVX512VBMI-NEXT:    movq -56(%rsp,%r9), %rdx
+; AVX512VBMI-NEXT:    movq -48(%rsp,%r9), %rax
+; AVX512VBMI-NEXT:    movq %rax, %rsi
+; AVX512VBMI-NEXT:    shldq %cl, %rdx, %rsi
+; AVX512VBMI-NEXT:    movq -40(%rsp,%r9), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r8
+; AVX512VBMI-NEXT:    shldq %cl, %rax, %r8
+; AVX512VBMI-NEXT:    movq -32(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq %r11, %rbx
+; AVX512VBMI-NEXT:    shldq %cl, %r10, %rbx
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq -24(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    movq %rdi, %r10
+; AVX512VBMI-NEXT:    shldq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -64(%rsp,%r9), %r11
+; AVX512VBMI-NEXT:    movq -16(%rsp,%r9), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    shldq %cl, %rdi, %r15
+; AVX512VBMI-NEXT:    movq -8(%rsp,%r9), %rdi
+; AVX512VBMI-NEXT:    shldq %cl, %r14, %rdi
+; AVX512VBMI-NEXT:    shlxq %rcx, %r11, %r9
+; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
+; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
+; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
+; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT:    movq %r8, 24(%rax)
+; AVX512VBMI-NEXT:    movq %rsi, 16(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r9, (%rax)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %r = shl i512 1, %a0
+  ret i512 %r
 }
 
-define <8 x i64> @ashr_i512_1(<8 x i64> %a)  {
-; AVX512VL-LABEL: ashr_i512_1:
+define i512 @lshr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: lshr_signbit_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %esi
+; SSE-NEXT:    andl $56, %esi
+; SSE-NEXT:    movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT:    movq -120(%rsp,%rsi), %rax
+; SSE-NEXT:    movq %rax, %r8
+; SSE-NEXT:    shrdq %cl, %rdx, %r8
+; SSE-NEXT:    movq -104(%rsp,%rsi), %r9
+; SSE-NEXT:    shrdq %cl, %r9, %rdx
+; SSE-NEXT:    movq -96(%rsp,%rsi), %r10
+; SSE-NEXT:    shrdq %cl, %r10, %r9
+; SSE-NEXT:    movq -88(%rsp,%rsi), %r11
+; SSE-NEXT:    shrdq %cl, %r11, %r10
+; SSE-NEXT:    movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT:    shrdq %cl, %rbx, %r11
+; SSE-NEXT:    movq -72(%rsp,%rsi), %r14
+; SSE-NEXT:    shrdq %cl, %r14, %rbx
+; SSE-NEXT:    movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT:    shrdq %cl, %rax, %rsi
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    shrq %cl, %r14
+; SSE-NEXT:    movq %r14, 56(%rdi)
+; SSE-NEXT:    movq %rbx, 48(%rdi)
+; SSE-NEXT:    movq %r11, 40(%rdi)
+; SSE-NEXT:    movq %r10, 32(%rdi)
+; SSE-NEXT:    movq %r9, 24(%rdi)
+; SSE-NEXT:    movq %rdx, 16(%rdi)
+; SSE-NEXT:    movq %r8, 8(%rdi)
+; SSE-NEXT:    movq %rsi, (%rdi)
+; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: lshr_signbit_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %esi, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %esi
+; AVX2-NEXT:    andl $56, %esi
+; AVX2-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    shrdq %cl, %rdx, %r8
+; AVX2-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT:    shrdq %cl, %r9, %rdx
+; AVX2-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT:    shrdq %cl, %r10, %r9
+; AVX2-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT:    shrdq %cl, %r11, %r10
+; AVX2-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT:    shrdq %cl, %rbx, %r11
+; AVX2-NEXT:    movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT:    shrdq %cl, %rsi, %rbx
+; AVX2-NEXT:    shrdq %cl, %rax, %r14
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    shrxq %rcx, %rsi, %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rdi)
+; AVX2-NEXT:    movq %rbx, 48(%rdi)
+; AVX2-NEXT:    movq %r11, 40(%rdi)
+; AVX2-NEXT:    movq %r10, 32(%rdi)
+; AVX2-NEXT:    movq %r9, 24(%rdi)
+; AVX2-NEXT:    movq %rdx, 16(%rdi)
+; AVX2-NEXT:    movq %r8, 8(%rdi)
+; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: lshr_signbit_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %esi, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %esi
+; AVX512F-NEXT:    andl $56, %esi
+; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT:    movq %rax, %r8
+; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT:    shrdq %cl, %r10, %r9
+; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT:    shrdq %cl, %r11, %r10
+; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    shrxq %rcx, %rsi, %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rdi)
+; AVX512F-NEXT:    movq %rbx, 48(%rdi)
+; AVX512F-NEXT:    movq %r11, 40(%rdi)
+; AVX512F-NEXT:    movq %r10, 32(%rdi)
+; AVX512F-NEXT:    movq %r9, 24(%rdi)
+; AVX512F-NEXT:    movq %rdx, 16(%rdi)
+; AVX512F-NEXT:    movq %r8, 8(%rdi)
+; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: lshr_signbit_i512:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm3
-; AVX512VL-NEXT:    vpsllq $63, %xmm3, %xmm4
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
-; AVX512VL-NEXT:    vpsrlq $1, %xmm5, %xmm5
-; AVX512VL-NEXT:    vpor %xmm5, %xmm4, %xmm4
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
-; AVX512VL-NEXT:    vpsraq $1, %xmm3, %xmm3
-; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpsllq $63, %ymm1, %ymm1
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VL-NEXT:    vpsrlq $1, %ymm2, %ymm2
-; AVX512VL-NEXT:    vpor %ymm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
-; AVX512VL-NEXT:    vpsrlq $1, %zmm0, %zmm2
-; AVX512VL-NEXT:    vpshufd {{.*#+}} zmm0 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VL-NEXT:    vpsllq $63, %zmm0, %zmm0
-; AVX512VL-NEXT:    vporq %zmm2, %zmm0, %zmm0
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %esi
+; AVX512VL-NEXT:    andl $56, %esi
+; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT:    movq %rax, %r8
+; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
+; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    shrxq %rcx, %r14, %rcx
+; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VL-NEXT:    movq %r11, 40(%rdi)
+; AVX512VL-NEXT:    movq %r10, 32(%rdi)
+; AVX512VL-NEXT:    movq %r9, 24(%rdi)
+; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VL-NEXT:    movq %r8, 8(%rdi)
+; AVX512VL-NEXT:    movq %rsi, (%rdi)
+; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
-; AVX512VBMI-LABEL: ashr_i512_1:
+; AVX512VBMI-LABEL: lshr_signbit_i512:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; AVX512VBMI-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT:    vpsraq $1, %xmm2, %xmm2
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; AVX512VBMI-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; AVX512VBMI-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
-; AVX512VBMI-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; AVX512VBMI-NEXT:    vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; AVX512VBMI-NEXT:    vpshldq $63, %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %esi
+; AVX512VBMI-NEXT:    andl $56, %esi
+; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT:    movq %rax, %r8
+; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    shrxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
+; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
+  %s = shl i512 1, 511
+  %r = lshr i512 %s, %a0
+  ret i512 %r
+}
+
+define i512 @ashr_signbit_i512(i512 %a0) nounwind {
+; SSE-LABEL: ashr_signbit_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    xorps %xmm0, %xmm0
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $-1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movl %esi, %ecx
+; SSE-NEXT:    andl $63, %ecx
+; SSE-NEXT:    shrl $3, %esi
+; SSE-NEXT:    andl $56, %esi
+; SSE-NEXT:    movq -112(%rsp,%rsi), %rdx
+; SSE-NEXT:    movq -120(%rsp,%rsi), %rax
+; SSE-NEXT:    movq %rax, %r8
+; SSE-NEXT:    shrdq %cl, %rdx, %r8
+; SSE-NEXT:    movq -104(%rsp,%rsi), %r9
+; SSE-NEXT:    shrdq %cl, %r9, %rdx
+; SSE-NEXT:    movq -96(%rsp,%rsi), %r10
+; SSE-NEXT:    shrdq %cl, %r10, %r9
+; SSE-NEXT:    movq -88(%rsp,%rsi), %r11
+; SSE-NEXT:    shrdq %cl, %r11, %r10
+; SSE-NEXT:    movq -80(%rsp,%rsi), %rbx
+; SSE-NEXT:    shrdq %cl, %rbx, %r11
+; SSE-NEXT:    movq -72(%rsp,%rsi), %r14
+; SSE-NEXT:    shrdq %cl, %r14, %rbx
+; SSE-NEXT:    movq -128(%rsp,%rsi), %rsi
+; SSE-NEXT:    shrdq %cl, %rax, %rsi
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE-NEXT:    sarq %cl, %r14
+; SSE-NEXT:    movq %r14, 56(%rdi)
+; SSE-NEXT:    movq %rbx, 48(%rdi)
+; SSE-NEXT:    movq %r11, 40(%rdi)
+; SSE-NEXT:    movq %r10, 32(%rdi)
+; SSE-NEXT:    movq %r9, 24(%rdi)
+; SSE-NEXT:    movq %rdx, 16(%rdi)
+; SSE-NEXT:    movq %r8, 8(%rdi)
+; SSE-NEXT:    movq %rsi, (%rdi)
+; SSE-NEXT:    addq $8, %rsp
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: ashr_signbit_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    pushq %rax
+; AVX2-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %esi, %ecx
+; AVX2-NEXT:    andl $63, %ecx
+; AVX2-NEXT:    shrl $3, %esi
+; AVX2-NEXT:    andl $56, %esi
+; AVX2-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX2-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    shrdq %cl, %rdx, %r8
+; AVX2-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX2-NEXT:    shrdq %cl, %r9, %rdx
+; AVX2-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX2-NEXT:    shrdq %cl, %r10, %r9
+; AVX2-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX2-NEXT:    shrdq %cl, %r11, %r10
+; AVX2-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX2-NEXT:    shrdq %cl, %rbx, %r11
+; AVX2-NEXT:    movq -128(%rsp,%rsi), %r14
+; AVX2-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX2-NEXT:    shrdq %cl, %rsi, %rbx
+; AVX2-NEXT:    shrdq %cl, %rax, %r14
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    sarxq %rcx, %rsi, %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rdi)
+; AVX2-NEXT:    movq %rbx, 48(%rdi)
+; AVX2-NEXT:    movq %r11, 40(%rdi)
+; AVX2-NEXT:    movq %r10, 32(%rdi)
+; AVX2-NEXT:    movq %r9, 24(%rdi)
+; AVX2-NEXT:    movq %rdx, 16(%rdi)
+; AVX2-NEXT:    movq %r8, 8(%rdi)
+; AVX2-NEXT:    movq %r14, (%rdi)
+; AVX2-NEXT:    addq $8, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
 ;
-; ZNVER4-LABEL: ashr_i512_1:
-; ZNVER4:       # %bb.0:
-; ZNVER4-NEXT:    vextracti32x4 $2, %zmm0, %xmm3
-; ZNVER4-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; ZNVER4-NEXT:    vextracti32x4 $3, %zmm0, %xmm2
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
-; ZNVER4-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; ZNVER4-NEXT:    vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
-; ZNVER4-NEXT:    vpshldq $63, %xmm4, %xmm2, %xmm4
-; ZNVER4-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; ZNVER4-NEXT:    vpshldq $63, %ymm3, %ymm1, %ymm1
-; ZNVER4-NEXT:    vpshufd {{.*#+}} zmm3 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
-; ZNVER4-NEXT:    vpsraq $1, %xmm2, %xmm2
-; ZNVER4-NEXT:    vpshldq $63, %zmm0, %zmm3, %zmm0
-; ZNVER4-NEXT:    vinserti128 $1, %xmm2, %ymm4, %ymm2
-; ZNVER4-NEXT:    vinserti64x4 $1, %ymm2, %zmm1, %zmm1
-; ZNVER4-NEXT:    vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
-; ZNVER4-NEXT:    retq
-  %d = bitcast <8 x i64> %a to i512
-  %s = ashr i512 %d, 1
-  %r = bitcast i512 %s to <8 x i64>
-  ret <8 x i64> %r
+; AVX512F-LABEL: ashr_signbit_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    pushq %rax
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = -1
+; AVX512F-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovaps {{.*#+}} zmm0 = [0,0,0,0,0,0,0,9223372036854775808]
+; AVX512F-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    movl %esi, %ecx
+; AVX512F-NEXT:    andl $63, %ecx
+; AVX512F-NEXT:    shrl $3, %esi
+; AVX512F-NEXT:    andl $56, %esi
+; AVX512F-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512F-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512F-NEXT:    movq %rax, %r8
+; AVX512F-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512F-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512F-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512F-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512F-NEXT:    shrdq %cl, %r10, %r9
+; AVX512F-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512F-NEXT:    shrdq %cl, %r11, %r10
+; AVX512F-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512F-NEXT:    movq -128(%rsp,%rsi), %r14
+; AVX512F-NEXT:    movq -72(%rsp,%rsi), %rsi
+; AVX512F-NEXT:    shrdq %cl, %rsi, %rbx
+; AVX512F-NEXT:    shrdq %cl, %rax, %r14
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    sarxq %rcx, %rsi, %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rdi)
+; AVX512F-NEXT:    movq %rbx, 48(%rdi)
+; AVX512F-NEXT:    movq %r11, 40(%rdi)
+; AVX512F-NEXT:    movq %r10, 32(%rdi)
+; AVX512F-NEXT:    movq %r9, 24(%rdi)
+; AVX512F-NEXT:    movq %rdx, 16(%rdi)
+; AVX512F-NEXT:    movq %r8, 8(%rdi)
+; AVX512F-NEXT:    movq %r14, (%rdi)
+; AVX512F-NEXT:    addq $8, %rsp
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: ashr_signbit_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    pushq %rax
+; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VL-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    movl %esi, %ecx
+; AVX512VL-NEXT:    andl $63, %ecx
+; AVX512VL-NEXT:    shrl $3, %esi
+; AVX512VL-NEXT:    andl $56, %esi
+; AVX512VL-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512VL-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VL-NEXT:    movq %rax, %r8
+; AVX512VL-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512VL-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VL-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512VL-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VL-NEXT:    shrdq %cl, %r10, %r9
+; AVX512VL-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VL-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VL-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VL-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VL-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VL-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VL-NEXT:    shrdq %cl, %rax, %rsi
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    sarxq %rcx, %r14, %rcx
+; AVX512VL-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VL-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VL-NEXT:    movq %r11, 40(%rdi)
+; AVX512VL-NEXT:    movq %r10, 32(%rdi)
+; AVX512VL-NEXT:    movq %r9, 24(%rdi)
+; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VL-NEXT:    movq %r8, 8(%rdi)
+; AVX512VL-NEXT:    movq %rsi, (%rdi)
+; AVX512VL-NEXT:    addq $8, %rsp
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: ashr_signbit_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    pushq %rax
+; AVX512VBMI-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
+; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vxorps %xmm0, %xmm0, %xmm0
+; AVX512VBMI-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    movl %esi, %ecx
+; AVX512VBMI-NEXT:    andl $63, %ecx
+; AVX512VBMI-NEXT:    shrl $3, %esi
+; AVX512VBMI-NEXT:    andl $56, %esi
+; AVX512VBMI-NEXT:    movq -112(%rsp,%rsi), %rdx
+; AVX512VBMI-NEXT:    movq -120(%rsp,%rsi), %rax
+; AVX512VBMI-NEXT:    movq %rax, %r8
+; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r8
+; AVX512VBMI-NEXT:    movq -104(%rsp,%rsi), %r9
+; AVX512VBMI-NEXT:    shrdq %cl, %r9, %rdx
+; AVX512VBMI-NEXT:    movq -96(%rsp,%rsi), %r10
+; AVX512VBMI-NEXT:    shrdq %cl, %r10, %r9
+; AVX512VBMI-NEXT:    movq -88(%rsp,%rsi), %r11
+; AVX512VBMI-NEXT:    shrdq %cl, %r11, %r10
+; AVX512VBMI-NEXT:    movq -80(%rsp,%rsi), %rbx
+; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
+; AVX512VBMI-NEXT:    movq -72(%rsp,%rsi), %r14
+; AVX512VBMI-NEXT:    shrdq %cl, %r14, %rbx
+; AVX512VBMI-NEXT:    movq -128(%rsp,%rsi), %rsi
+; AVX512VBMI-NEXT:    shrdq %cl, %rax, %rsi
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    sarxq %rcx, %r14, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 56(%rdi)
+; AVX512VBMI-NEXT:    movq %rbx, 48(%rdi)
+; AVX512VBMI-NEXT:    movq %r11, 40(%rdi)
+; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
+; AVX512VBMI-NEXT:    movq %r9, 24(%rdi)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
+; AVX512VBMI-NEXT:    movq %r8, 8(%rdi)
+; AVX512VBMI-NEXT:    movq %rsi, (%rdi)
+; AVX512VBMI-NEXT:    addq $8, %rsp
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %s = shl i512 1, 511
+  %r = ashr i512 %s, %a0
+  ret i512 %r
 }


