[llvm] [X86] Add bitreverse/bswap i128/i256/i512 test coverage for #187353 (PR #187492)

Simon Pilgrim via llvm-commits llvm-commits at lists.llvm.org
Thu Mar 19 05:23:57 PDT 2026


https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/187492

None
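
These tests pin down the current scalar expansion on x86-64: each 64-bit limb is
byte-reversed (bswapq, or movbeq when MOVBE is available) and, for bitreverse,
the nibbles, bit pairs, and adjacent bits are then swapped with the usual SWAR
masks (0x0F0F..., 0x3333..., 0x5555...), which are exactly the movabsq constants
in the CHECK lines below. A minimal C sketch of that per-limb expansion
(illustrative only, not part of the patch; the wide types apply this per limb
and reverse the limb order):

    #include <stdint.h>

    /* Reverse the bits of one 64-bit limb. */
    static uint64_t bitreverse64(uint64_t v) {
      v = __builtin_bswap64(v);                 /* reverse byte order      */
      uint64_t m4 = 0x0F0F0F0F0F0F0F0FULL;      /* nibble mask             */
      v = ((v >> 4) & m4) | ((v & m4) << 4);    /* swap nibbles per byte   */
      uint64_t m2 = 0x3333333333333333ULL;      /* bit-pair mask           */
      v = ((v >> 2) & m2) | ((v & m2) << 2);    /* swap bit pairs          */
      uint64_t m1 = 0x5555555555555555ULL;      /* single-bit mask         */
      v = ((v >> 1) & m1) | ((v & m1) << 1);    /* swap adjacent bits      */
      return v;
    }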

>From 4fa9f4db915aa359f12339d8680dbcd6c716f87f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 19 Mar 2026 12:22:55 +0000
Subject: [PATCH] [X86] Add bitreverse/bswap i128/i256/i512 test coverage for
 #187353

---
 llvm/test/CodeGen/X86/bit-manip-i128.ll |  459 ++++
 llvm/test/CodeGen/X86/bit-manip-i256.ll | 1492 +++++++++++
 llvm/test/CodeGen/X86/bit-manip-i512.ll | 2987 ++++++++++++++++++++++-
 3 files changed, 4937 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/X86/bit-manip-i128.ll b/llvm/test/CodeGen/X86/bit-manip-i128.ll
index b00254a4aab3d..5d048bfead2f5 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i128.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i128.ll
@@ -1091,5 +1091,464 @@ define i128 @isolate_msb_i128_load(ptr %p0, i128 %idx) nounwind {
   ret i128 %res
 }
 
+define i128 @bswap_i128(i128 %a0) nounwind {
+; CHECK-LABEL: bswap_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    bswapq %rax
+; CHECK-NEXT:    bswapq %rdi
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    retq
+  %res = call i128 @llvm.bswap.i128(i128 %a0)
+  ret i128 %res
+}
+
+define i128 @bswap_i128_vector(<2 x i64> %v0) nounwind {
+; SSE2-LABEL: bswap_i128_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    bswapq %rdx
+; SSE2-NEXT:    bswapq %rax
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bswap_i128_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %xmm0, %rdx
+; SSE42-NEXT:    pextrq $1, %xmm0, %rax
+; SSE42-NEXT:    bswapq %rax
+; SSE42-NEXT:    bswapq %rdx
+; SSE42-NEXT:    retq
+;
+; AVX-LABEL: bswap_i128_vector:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovq %xmm0, %rdx
+; AVX-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX-NEXT:    bswapq %rax
+; AVX-NEXT:    bswapq %rdx
+; AVX-NEXT:    retq
+  %a0 = bitcast <2 x i64> %v0 to i128
+  %res = call i128 @llvm.bswap.i128(i128 %a0)
+  ret i128 %res
+}
+
+define i128 @bswap_i128_load(ptr %p0) nounwind {
+; SSE-LABEL: bswap_i128_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq (%rdi), %rdx
+; SSE-NEXT:    movq 8(%rdi), %rax
+; SSE-NEXT:    bswapq %rax
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: bswap_i128_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movbeq 8(%rdi), %rax
+; AVX-NEXT:    movbeq (%rdi), %rdx
+; AVX-NEXT:    retq
+  %a0 = load i128, ptr %p0
+  %res = call i128 @llvm.bswap.i128(i128 %a0)
+  ret i128 %res
+}
+
+define i128 @bitreverse_i128(i128 %a0) nounwind {
+; CHECK-LABEL: bitreverse_i128:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    bswapq %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    shrq $4, %rax
+; CHECK-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    andq %rdx, %rsi
+; CHECK-NEXT:    shlq $4, %rsi
+; CHECK-NEXT:    orq %rax, %rsi
+; CHECK-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    andq %rcx, %rax
+; CHECK-NEXT:    shrq $2, %rsi
+; CHECK-NEXT:    andq %rcx, %rsi
+; CHECK-NEXT:    leaq (%rsi,%rax,4), %rax
+; CHECK-NEXT:    movabsq $6148914691236517205, %rsi # imm = 0x5555555555555555
+; CHECK-NEXT:    movq %rax, %r8
+; CHECK-NEXT:    andq %rsi, %r8
+; CHECK-NEXT:    shrq %rax
+; CHECK-NEXT:    andq %rsi, %rax
+; CHECK-NEXT:    leaq (%rax,%r8,2), %rax
+; CHECK-NEXT:    bswapq %rdi
+; CHECK-NEXT:    movq %rdi, %r8
+; CHECK-NEXT:    shrq $4, %r8
+; CHECK-NEXT:    andq %rdx, %r8
+; CHECK-NEXT:    andq %rdx, %rdi
+; CHECK-NEXT:    shlq $4, %rdi
+; CHECK-NEXT:    orq %r8, %rdi
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    andq %rcx, %rdx
+; CHECK-NEXT:    shrq $2, %rdi
+; CHECK-NEXT:    andq %rcx, %rdi
+; CHECK-NEXT:    leaq (%rdi,%rdx,4), %rcx
+; CHECK-NEXT:    movq %rcx, %rdx
+; CHECK-NEXT:    andq %rsi, %rdx
+; CHECK-NEXT:    shrq %rcx
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; CHECK-NEXT:    retq
+  %res = call i128 @llvm.bitreverse.i128(i128 %a0)
+  ret i128 %res
+}
+
+define i128 @bitreverse_i128_vector(<2 x i64> %v0) nounwind {
+; SSE2-LABEL: bitreverse_i128_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    bswapq %rdx
+; SSE2-NEXT:    movq %xmm1, %rax
+; SSE2-NEXT:    movq %rdx, %rcx
+; SSE2-NEXT:    shrq $4, %rcx
+; SSE2-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE2-NEXT:    andq %rsi, %rcx
+; SSE2-NEXT:    andq %rsi, %rdx
+; SSE2-NEXT:    shlq $4, %rdx
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; SSE2-NEXT:    movq %rdx, %rdi
+; SSE2-NEXT:    andq %rcx, %rdi
+; SSE2-NEXT:    shrq $2, %rdx
+; SSE2-NEXT:    andq %rcx, %rdx
+; SSE2-NEXT:    leaq (%rdx,%rdi,4), %rdx
+; SSE2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE2-NEXT:    movq %rdx, %r8
+; SSE2-NEXT:    andq %rdi, %r8
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andq %rdi, %rdx
+; SSE2-NEXT:    leaq (%rdx,%r8,2), %rdx
+; SSE2-NEXT:    bswapq %rax
+; SSE2-NEXT:    movq %rax, %r8
+; SSE2-NEXT:    shrq $4, %r8
+; SSE2-NEXT:    andq %rsi, %r8
+; SSE2-NEXT:    andq %rsi, %rax
+; SSE2-NEXT:    shlq $4, %rax
+; SSE2-NEXT:    orq %r8, %rax
+; SSE2-NEXT:    movq %rax, %rsi
+; SSE2-NEXT:    andq %rcx, %rsi
+; SSE2-NEXT:    shrq $2, %rax
+; SSE2-NEXT:    andq %rcx, %rax
+; SSE2-NEXT:    leaq (%rax,%rsi,4), %rax
+; SSE2-NEXT:    movq %rax, %rcx
+; SSE2-NEXT:    andq %rdi, %rcx
+; SSE2-NEXT:    shrq %rax
+; SSE2-NEXT:    andq %rdi, %rax
+; SSE2-NEXT:    leaq (%rax,%rcx,2), %rax
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bitreverse_i128_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    pextrq $1, %xmm0, %rax
+; SSE42-NEXT:    bswapq %rax
+; SSE42-NEXT:    movq %xmm0, %rcx
+; SSE42-NEXT:    movq %rax, %rdx
+; SSE42-NEXT:    shrq $4, %rdx
+; SSE42-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE42-NEXT:    andq %rsi, %rdx
+; SSE42-NEXT:    andq %rsi, %rax
+; SSE42-NEXT:    shlq $4, %rax
+; SSE42-NEXT:    orq %rdx, %rax
+; SSE42-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; SSE42-NEXT:    movq %rax, %rdi
+; SSE42-NEXT:    andq %rdx, %rdi
+; SSE42-NEXT:    shrq $2, %rax
+; SSE42-NEXT:    andq %rdx, %rax
+; SSE42-NEXT:    leaq (%rax,%rdi,4), %rax
+; SSE42-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE42-NEXT:    movq %rax, %r8
+; SSE42-NEXT:    andq %rdi, %r8
+; SSE42-NEXT:    shrq %rax
+; SSE42-NEXT:    andq %rdi, %rax
+; SSE42-NEXT:    leaq (%rax,%r8,2), %rax
+; SSE42-NEXT:    bswapq %rcx
+; SSE42-NEXT:    movq %rcx, %r8
+; SSE42-NEXT:    shrq $4, %r8
+; SSE42-NEXT:    andq %rsi, %r8
+; SSE42-NEXT:    andq %rsi, %rcx
+; SSE42-NEXT:    shlq $4, %rcx
+; SSE42-NEXT:    orq %r8, %rcx
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    andq %rdx, %rsi
+; SSE42-NEXT:    shrq $2, %rcx
+; SSE42-NEXT:    andq %rdx, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    andq %rdi, %rdx
+; SSE42-NEXT:    shrq %rcx
+; SSE42-NEXT:    andq %rdi, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; SSE42-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i128_vector:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    bswapq %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    shrq $4, %rdx
+; AVX2-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rsi, %rdx
+; AVX2-NEXT:    andq %rsi, %rax
+; AVX2-NEXT:    shlq $4, %rax
+; AVX2-NEXT:    orq %rdx, %rax
+; AVX2-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    andq %rdx, %rdi
+; AVX2-NEXT:    shrq $2, %rax
+; AVX2-NEXT:    andq %rdx, %rax
+; AVX2-NEXT:    leaq (%rax,%rdi,4), %rax
+; AVX2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    shrq %rax
+; AVX2-NEXT:    andq %rdi, %rax
+; AVX2-NEXT:    leaq (%rax,%r8,2), %rax
+; AVX2-NEXT:    bswapq %rcx
+; AVX2-NEXT:    movq %rcx, %r8
+; AVX2-NEXT:    shrq $4, %r8
+; AVX2-NEXT:    andq %rsi, %r8
+; AVX2-NEXT:    andq %rsi, %rcx
+; AVX2-NEXT:    shlq $4, %rcx
+; AVX2-NEXT:    orq %r8, %rcx
+; AVX2-NEXT:    movq %rcx, %rsi
+; AVX2-NEXT:    andq %rdx, %rsi
+; AVX2-NEXT:    shrq $2, %rcx
+; AVX2-NEXT:    andq %rdx, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i128_vector:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512F-NEXT:    bswapq %rax
+; AVX512F-NEXT:    vmovq %xmm0, %rcx
+; AVX512F-NEXT:    movq %rax, %rdx
+; AVX512F-NEXT:    shrq $4, %rdx
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rsi, %rdx
+; AVX512F-NEXT:    andq %rsi, %rax
+; AVX512F-NEXT:    shlq $4, %rax
+; AVX512F-NEXT:    orq %rdx, %rax
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %rax, %rdi
+; AVX512F-NEXT:    andq %rdx, %rdi
+; AVX512F-NEXT:    shrq $2, %rax
+; AVX512F-NEXT:    andq %rdx, %rax
+; AVX512F-NEXT:    leaq (%rax,%rdi,4), %rax
+; AVX512F-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %rax, %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    shrq %rax
+; AVX512F-NEXT:    andq %rdi, %rax
+; AVX512F-NEXT:    leaq (%rax,%r8,2), %rax
+; AVX512F-NEXT:    bswapq %rcx
+; AVX512F-NEXT:    movq %rcx, %r8
+; AVX512F-NEXT:    shrq $4, %r8
+; AVX512F-NEXT:    andq %rsi, %r8
+; AVX512F-NEXT:    andq %rsi, %rcx
+; AVX512F-NEXT:    shlq $4, %rcx
+; AVX512F-NEXT:    orq %r8, %rcx
+; AVX512F-NEXT:    movq %rcx, %rsi
+; AVX512F-NEXT:    andq %rdx, %rsi
+; AVX512F-NEXT:    shrq $2, %rcx
+; AVX512F-NEXT:    andq %rdx, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i128_vector:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovq %xmm0, %rcx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VL-NEXT:    bswapq %rax
+; AVX512VL-NEXT:    movq %rax, %rdx
+; AVX512VL-NEXT:    shrq $4, %rdx
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %rsi, %rdx
+; AVX512VL-NEXT:    andq %rsi, %rax
+; AVX512VL-NEXT:    shlq $4, %rax
+; AVX512VL-NEXT:    orq %rdx, %rax
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %rax, %rdi
+; AVX512VL-NEXT:    andq %rdx, %rdi
+; AVX512VL-NEXT:    shrq $2, %rax
+; AVX512VL-NEXT:    andq %rdx, %rax
+; AVX512VL-NEXT:    leaq (%rax,%rdi,4), %rax
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %rax, %r8
+; AVX512VL-NEXT:    andq %rdi, %r8
+; AVX512VL-NEXT:    shrq %rax
+; AVX512VL-NEXT:    andq %rdi, %rax
+; AVX512VL-NEXT:    leaq (%rax,%r8,2), %rax
+; AVX512VL-NEXT:    bswapq %rcx
+; AVX512VL-NEXT:    movq %rcx, %r8
+; AVX512VL-NEXT:    shrq $4, %r8
+; AVX512VL-NEXT:    andq %rsi, %r8
+; AVX512VL-NEXT:    andq %rsi, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %r8, %rcx
+; AVX512VL-NEXT:    movq %rcx, %rsi
+; AVX512VL-NEXT:    andq %rdx, %rsi
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %rdx, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i128_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rcx
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512VBMI-NEXT:    bswapq %rax
+; AVX512VBMI-NEXT:    movq %rax, %rdx
+; AVX512VBMI-NEXT:    shrq $4, %rdx
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %rsi, %rdx
+; AVX512VBMI-NEXT:    andq %rsi, %rax
+; AVX512VBMI-NEXT:    shlq $4, %rax
+; AVX512VBMI-NEXT:    orq %rdx, %rax
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %rax, %rdi
+; AVX512VBMI-NEXT:    andq %rdx, %rdi
+; AVX512VBMI-NEXT:    shrq $2, %rax
+; AVX512VBMI-NEXT:    andq %rdx, %rax
+; AVX512VBMI-NEXT:    leaq (%rax,%rdi,4), %rax
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %rax, %r8
+; AVX512VBMI-NEXT:    andq %rdi, %r8
+; AVX512VBMI-NEXT:    shrq %rax
+; AVX512VBMI-NEXT:    andq %rdi, %rax
+; AVX512VBMI-NEXT:    leaq (%rax,%r8,2), %rax
+; AVX512VBMI-NEXT:    bswapq %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %r8
+; AVX512VBMI-NEXT:    shrq $4, %r8
+; AVX512VBMI-NEXT:    andq %rsi, %r8
+; AVX512VBMI-NEXT:    andq %rsi, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %r8, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rsi
+; AVX512VBMI-NEXT:    andq %rdx, %rsi
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %rdx, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; AVX512VBMI-NEXT:    retq
+  %a0 = bitcast <2 x i64> %v0 to i128
+  %res = call i128 @llvm.bitreverse.i128(i128 %a0)
+  ret i128 %res
+}
+
+define i128 @bitreverse_i128_load(ptr %p0) nounwind {
+; SSE-LABEL: bitreverse_i128_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq 8(%rdi), %rax
+; SSE-NEXT:    bswapq %rax
+; SSE-NEXT:    movq (%rdi), %rcx
+; SSE-NEXT:    movq %rax, %rdx
+; SSE-NEXT:    shrq $4, %rdx
+; SSE-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE-NEXT:    andq %rsi, %rdx
+; SSE-NEXT:    andq %rsi, %rax
+; SSE-NEXT:    shlq $4, %rax
+; SSE-NEXT:    orq %rdx, %rax
+; SSE-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rax, %rdi
+; SSE-NEXT:    andq %rdx, %rdi
+; SSE-NEXT:    shrq $2, %rax
+; SSE-NEXT:    andq %rdx, %rax
+; SSE-NEXT:    leaq (%rax,%rdi,4), %rax
+; SSE-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE-NEXT:    movq %rax, %r8
+; SSE-NEXT:    andq %rdi, %r8
+; SSE-NEXT:    shrq %rax
+; SSE-NEXT:    andq %rdi, %rax
+; SSE-NEXT:    leaq (%rax,%r8,2), %rax
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq %rcx, %r8
+; SSE-NEXT:    shrq $4, %r8
+; SSE-NEXT:    andq %rsi, %r8
+; SSE-NEXT:    andq %rsi, %rcx
+; SSE-NEXT:    shlq $4, %rcx
+; SSE-NEXT:    orq %r8, %rcx
+; SSE-NEXT:    movq %rcx, %rsi
+; SSE-NEXT:    andq %rdx, %rsi
+; SSE-NEXT:    shrq $2, %rcx
+; SSE-NEXT:    andq %rdx, %rcx
+; SSE-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    andq %rdi, %rdx
+; SSE-NEXT:    shrq %rcx
+; SSE-NEXT:    andq %rdi, %rcx
+; SSE-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: bitreverse_i128_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movbeq 8(%rdi), %rax
+; AVX-NEXT:    movq %rax, %rcx
+; AVX-NEXT:    shrq $4, %rcx
+; AVX-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX-NEXT:    andq %rdx, %rcx
+; AVX-NEXT:    andq %rdx, %rax
+; AVX-NEXT:    shlq $4, %rax
+; AVX-NEXT:    orq %rcx, %rax
+; AVX-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX-NEXT:    movq %rax, %rsi
+; AVX-NEXT:    andq %rcx, %rsi
+; AVX-NEXT:    shrq $2, %rax
+; AVX-NEXT:    andq %rcx, %rax
+; AVX-NEXT:    leaq (%rax,%rsi,4), %rax
+; AVX-NEXT:    movabsq $6148914691236517205, %rsi # imm = 0x5555555555555555
+; AVX-NEXT:    movq %rax, %r8
+; AVX-NEXT:    andq %rsi, %r8
+; AVX-NEXT:    shrq %rax
+; AVX-NEXT:    andq %rsi, %rax
+; AVX-NEXT:    leaq (%rax,%r8,2), %rax
+; AVX-NEXT:    movbeq (%rdi), %rdi
+; AVX-NEXT:    movq %rdi, %r8
+; AVX-NEXT:    shrq $4, %r8
+; AVX-NEXT:    andq %rdx, %r8
+; AVX-NEXT:    andq %rdx, %rdi
+; AVX-NEXT:    shlq $4, %rdi
+; AVX-NEXT:    orq %r8, %rdi
+; AVX-NEXT:    movq %rdi, %rdx
+; AVX-NEXT:    andq %rcx, %rdx
+; AVX-NEXT:    shrq $2, %rdi
+; AVX-NEXT:    andq %rcx, %rdi
+; AVX-NEXT:    leaq (%rdi,%rdx,4), %rcx
+; AVX-NEXT:    movq %rcx, %rdx
+; AVX-NEXT:    andq %rsi, %rdx
+; AVX-NEXT:    shrq %rcx
+; AVX-NEXT:    andq %rsi, %rcx
+; AVX-NEXT:    leaq (%rcx,%rdx,2), %rdx
+; AVX-NEXT:    retq
+  %a0 = load i128, ptr %p0
+  %res = call i128 @llvm.bitreverse.i128(i128 %a0)
+  ret i128 %res
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX512: {{.*}}
diff --git a/llvm/test/CodeGen/X86/bit-manip-i256.ll b/llvm/test/CodeGen/X86/bit-manip-i256.ll
index dba8d0d3dd07f..39e649d13c51e 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i256.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i256.ll
@@ -3281,6 +3281,1498 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
   ret i256 %res
 }
 
+define i256 @bswap_i256(i256 %a0) nounwind {
+; SSE-LABEL: bswap_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    bswapq %r8
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, 24(%rdi)
+; SSE-NEXT:    movq %rdx, 16(%rdi)
+; SSE-NEXT:    movq %rcx, 8(%rdi)
+; SSE-NEXT:    movq %r8, (%rdi)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: bswap_i256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    movbeq %rsi, 24(%rdi)
+; AVX-NEXT:    movbeq %rdx, 16(%rdi)
+; AVX-NEXT:    movbeq %rcx, 8(%rdi)
+; AVX-NEXT:    movbeq %r8, (%rdi)
+; AVX-NEXT:    retq
+  %res = call i256 @llvm.bswap.i256(i256 %a0)
+  ret i256 %res
+}
+
+define i256 @bswap_i256_vector(<4 x i64> %v0) nounwind {
+; SSE2-LABEL: bswap_i256_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    bswapq %rsi
+; SSE2-NEXT:    bswapq %rdx
+; SSE2-NEXT:    bswapq %rcx
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    bswapq %rdi
+; SSE2-NEXT:    movq %rdi, 24(%rax)
+; SSE2-NEXT:    movq %rcx, 16(%rax)
+; SSE2-NEXT:    movq %rdx, 8(%rax)
+; SSE2-NEXT:    movq %rsi, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bswap_i256_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE42-NEXT:    movq %xmm1, %rdx
+; SSE42-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE42-NEXT:    bswapq %rsi
+; SSE42-NEXT:    bswapq %rdx
+; SSE42-NEXT:    bswapq %rcx
+; SSE42-NEXT:    movq %xmm0, %rdi
+; SSE42-NEXT:    bswapq %rdi
+; SSE42-NEXT:    movq %rdi, 24(%rax)
+; SSE42-NEXT:    movq %rcx, 16(%rax)
+; SSE42-NEXT:    movq %rdx, 8(%rax)
+; SSE42-NEXT:    movq %rsi, (%rax)
+; SSE42-NEXT:    retq
+;
+; AVX2-LABEL: bswap_i256_vector:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    vmovq %xmm0, %rsi
+; AVX2-NEXT:    movbeq %rsi, 24(%rdi)
+; AVX2-NEXT:    movbeq %rdx, 16(%rdi)
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    movbeq %rdx, 8(%rdi)
+; AVX2-NEXT:    movbeq %rcx, (%rdi)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bswap_i256_vector:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rsi
+; AVX512F-NEXT:    movbeq %rsi, 24(%rdi)
+; AVX512F-NEXT:    movbeq %rdx, 16(%rdi)
+; AVX512F-NEXT:    vmovq %xmm1, %rdx
+; AVX512F-NEXT:    movbeq %rdx, 8(%rdi)
+; AVX512F-NEXT:    movbeq %rcx, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bswap_i256_vector:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT:    vmovq %xmm1, %rdx
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    vmovq %xmm0, %rdi
+; AVX512VL-NEXT:    movbeq %rdi, 24(%rax)
+; AVX512VL-NEXT:    movbeq %rsi, 16(%rax)
+; AVX512VL-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512VL-NEXT:    movbeq %rcx, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bswap_i256_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rdx
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rdi
+; AVX512VBMI-NEXT:    movbeq %rdi, 24(%rax)
+; AVX512VBMI-NEXT:    movbeq %rsi, 16(%rax)
+; AVX512VBMI-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512VBMI-NEXT:    movbeq %rcx, (%rax)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %a0 = bitcast <4 x i64> %v0 to i256
+  %res = call i256 @llvm.bswap.i256(i256 %a0)
+  ret i256 %res
+}
+
+define i256 @bswap_i256_load(ptr %p0) nounwind {
+; SSE-LABEL: bswap_i256_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq 8(%rsi), %rcx
+; SSE-NEXT:    movq 16(%rsi), %rdx
+; SSE-NEXT:    movq 24(%rsi), %rdi
+; SSE-NEXT:    bswapq %rdi
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq (%rsi), %rsi
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, 24(%rax)
+; SSE-NEXT:    movq %rcx, 16(%rax)
+; SSE-NEXT:    movq %rdx, 8(%rax)
+; SSE-NEXT:    movq %rdi, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: bswap_i256_load:
+; AVX:       # %bb.0:
+; AVX-NEXT:    movq %rdi, %rax
+; AVX-NEXT:    movq 24(%rsi), %rcx
+; AVX-NEXT:    movq 16(%rsi), %rdx
+; AVX-NEXT:    movq (%rsi), %rdi
+; AVX-NEXT:    movq 8(%rsi), %rsi
+; AVX-NEXT:    movbeq %rdi, 24(%rax)
+; AVX-NEXT:    movbeq %rsi, 16(%rax)
+; AVX-NEXT:    movbeq %rdx, 8(%rax)
+; AVX-NEXT:    movbeq %rcx, (%rax)
+; AVX-NEXT:    retq
+  %a0 = load i256, ptr %p0
+  %res = call i256 @llvm.bswap.i256(i256 %a0)
+  ret i256 %res
+}
+
+define i256 @bitreverse_i256(i256 %a0) nounwind {
+; SSE-LABEL: bitreverse_i256:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    bswapq %r8
+; SSE-NEXT:    movq %r8, %rdi
+; SSE-NEXT:    shrq $4, %rdi
+; SSE-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
+; SSE-NEXT:    andq %r9, %rdi
+; SSE-NEXT:    andq %r9, %r8
+; SSE-NEXT:    shlq $4, %r8
+; SSE-NEXT:    orq %rdi, %r8
+; SSE-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; SSE-NEXT:    movq %r8, %r10
+; SSE-NEXT:    andq %rdi, %r10
+; SSE-NEXT:    shrq $2, %r8
+; SSE-NEXT:    andq %rdi, %r8
+; SSE-NEXT:    leaq (%r8,%r10,4), %r10
+; SSE-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; SSE-NEXT:    movq %r10, %r11
+; SSE-NEXT:    andq %r8, %r11
+; SSE-NEXT:    shrq %r10
+; SSE-NEXT:    andq %r8, %r10
+; SSE-NEXT:    leaq (%r10,%r11,2), %r10
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %r9, %r11
+; SSE-NEXT:    andq %r9, %rcx
+; SSE-NEXT:    shlq $4, %rcx
+; SSE-NEXT:    orq %r11, %rcx
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shrq $2, %rcx
+; SSE-NEXT:    andq %rdi, %rcx
+; SSE-NEXT:    leaq (%rcx,%r11,4), %rcx
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    andq %r8, %r11
+; SSE-NEXT:    shrq %rcx
+; SSE-NEXT:    andq %r8, %rcx
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    leaq (%rcx,%r11,2), %rcx
+; SSE-NEXT:    movq %rdx, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %r9, %r11
+; SSE-NEXT:    andq %r9, %rdx
+; SSE-NEXT:    shlq $4, %rdx
+; SSE-NEXT:    orq %r11, %rdx
+; SSE-NEXT:    movq %rdx, %r11
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shrq $2, %rdx
+; SSE-NEXT:    andq %rdi, %rdx
+; SSE-NEXT:    leaq (%rdx,%r11,4), %rdx
+; SSE-NEXT:    movq %rdx, %r11
+; SSE-NEXT:    andq %r8, %r11
+; SSE-NEXT:    shrq %rdx
+; SSE-NEXT:    andq %r8, %rdx
+; SSE-NEXT:    leaq (%rdx,%r11,2), %rdx
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %r9, %r11
+; SSE-NEXT:    andq %r9, %rsi
+; SSE-NEXT:    shlq $4, %rsi
+; SSE-NEXT:    orq %r11, %rsi
+; SSE-NEXT:    movq %rsi, %r9
+; SSE-NEXT:    andq %rdi, %r9
+; SSE-NEXT:    shrq $2, %rsi
+; SSE-NEXT:    andq %rdi, %rsi
+; SSE-NEXT:    leaq (%rsi,%r9,4), %rsi
+; SSE-NEXT:    movq %rsi, %rdi
+; SSE-NEXT:    andq %r8, %rdi
+; SSE-NEXT:    shrq %rsi
+; SSE-NEXT:    andq %r8, %rsi
+; SSE-NEXT:    leaq (%rsi,%rdi,2), %rsi
+; SSE-NEXT:    movq %rsi, 24(%rax)
+; SSE-NEXT:    movq %rdx, 16(%rax)
+; SSE-NEXT:    movq %rcx, 8(%rax)
+; SSE-NEXT:    movq %r10, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i256:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    bswapq %r8
+; AVX2-NEXT:    movq %r8, %rdi
+; AVX2-NEXT:    shrq $4, %rdi
+; AVX2-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %r9, %rdi
+; AVX2-NEXT:    andq %r9, %r8
+; AVX2-NEXT:    shlq $4, %r8
+; AVX2-NEXT:    orq %rdi, %r8
+; AVX2-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %r8, %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    shrq $2, %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    leaq (%r8,%r10,4), %r10
+; AVX2-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %r8, %r11
+; AVX2-NEXT:    shrq %r10
+; AVX2-NEXT:    andq %r8, %r10
+; AVX2-NEXT:    bswapq %rcx
+; AVX2-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX2-NEXT:    movq %rcx, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %r9, %r11
+; AVX2-NEXT:    andq %r9, %rcx
+; AVX2-NEXT:    shlq $4, %rcx
+; AVX2-NEXT:    orq %r11, %rcx
+; AVX2-NEXT:    movq %rcx, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq $2, %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%r11,4), %rcx
+; AVX2-NEXT:    movq %rcx, %r11
+; AVX2-NEXT:    andq %r8, %r11
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %r8, %rcx
+; AVX2-NEXT:    leaq (%rcx,%r11,2), %rcx
+; AVX2-NEXT:    bswapq %rdx
+; AVX2-NEXT:    movq %rdx, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %r9, %r11
+; AVX2-NEXT:    andq %r9, %rdx
+; AVX2-NEXT:    shlq $4, %rdx
+; AVX2-NEXT:    orq %r11, %rdx
+; AVX2-NEXT:    movq %rdx, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq $2, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    leaq (%rdx,%r11,4), %rdx
+; AVX2-NEXT:    movq %rdx, %r11
+; AVX2-NEXT:    andq %r8, %r11
+; AVX2-NEXT:    shrq %rdx
+; AVX2-NEXT:    andq %r8, %rdx
+; AVX2-NEXT:    leaq (%rdx,%r11,2), %rdx
+; AVX2-NEXT:    bswapq %rsi
+; AVX2-NEXT:    movq %rsi, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %r9, %r11
+; AVX2-NEXT:    andq %r9, %rsi
+; AVX2-NEXT:    shlq $4, %rsi
+; AVX2-NEXT:    orq %r11, %rsi
+; AVX2-NEXT:    movq %rsi, %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    shrq $2, %rsi
+; AVX2-NEXT:    andq %rdi, %rsi
+; AVX2-NEXT:    leaq (%rsi,%r9,4), %rsi
+; AVX2-NEXT:    movq %rsi, %rdi
+; AVX2-NEXT:    andq %r8, %rdi
+; AVX2-NEXT:    shrq %rsi
+; AVX2-NEXT:    andq %r8, %rsi
+; AVX2-NEXT:    leaq (%rsi,%rdi,2), %rsi
+; AVX2-NEXT:    movq %rsi, 24(%rax)
+; AVX2-NEXT:    movq %rdx, 16(%rax)
+; AVX2-NEXT:    movq %rcx, 8(%rax)
+; AVX2-NEXT:    movq %r10, (%rax)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i256:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    bswapq %r8
+; AVX512F-NEXT:    movq %r8, %rdi
+; AVX512F-NEXT:    shrq $4, %rdi
+; AVX512F-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %r9, %rdi
+; AVX512F-NEXT:    andq %r9, %r8
+; AVX512F-NEXT:    shlq $4, %r8
+; AVX512F-NEXT:    orq %rdi, %r8
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %r8, %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    shrq $2, %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    leaq (%r8,%r10,4), %r10
+; AVX512F-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %r8, %r11
+; AVX512F-NEXT:    shrq %r10
+; AVX512F-NEXT:    andq %r8, %r10
+; AVX512F-NEXT:    bswapq %rcx
+; AVX512F-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512F-NEXT:    movq %rcx, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %r9, %r11
+; AVX512F-NEXT:    andq %r9, %rcx
+; AVX512F-NEXT:    shlq $4, %rcx
+; AVX512F-NEXT:    orq %r11, %rcx
+; AVX512F-NEXT:    movq %rcx, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq $2, %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%r11,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %r11
+; AVX512F-NEXT:    andq %r8, %r11
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %r8, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%r11,2), %rcx
+; AVX512F-NEXT:    bswapq %rdx
+; AVX512F-NEXT:    movq %rdx, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %r9, %r11
+; AVX512F-NEXT:    andq %r9, %rdx
+; AVX512F-NEXT:    shlq $4, %rdx
+; AVX512F-NEXT:    orq %r11, %rdx
+; AVX512F-NEXT:    movq %rdx, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq $2, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%r11,4), %rdx
+; AVX512F-NEXT:    movq %rdx, %r11
+; AVX512F-NEXT:    andq %r8, %r11
+; AVX512F-NEXT:    shrq %rdx
+; AVX512F-NEXT:    andq %r8, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%r11,2), %rdx
+; AVX512F-NEXT:    bswapq %rsi
+; AVX512F-NEXT:    movq %rsi, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %r9, %r11
+; AVX512F-NEXT:    andq %r9, %rsi
+; AVX512F-NEXT:    shlq $4, %rsi
+; AVX512F-NEXT:    orq %r11, %rsi
+; AVX512F-NEXT:    movq %rsi, %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    shrq $2, %rsi
+; AVX512F-NEXT:    andq %rdi, %rsi
+; AVX512F-NEXT:    leaq (%rsi,%r9,4), %rsi
+; AVX512F-NEXT:    movq %rsi, %rdi
+; AVX512F-NEXT:    andq %r8, %rdi
+; AVX512F-NEXT:    shrq %rsi
+; AVX512F-NEXT:    andq %r8, %rsi
+; AVX512F-NEXT:    leaq (%rsi,%rdi,2), %rsi
+; AVX512F-NEXT:    movq %rsi, 24(%rax)
+; AVX512F-NEXT:    movq %rdx, 16(%rax)
+; AVX512F-NEXT:    movq %rcx, 8(%rax)
+; AVX512F-NEXT:    movq %r10, (%rax)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i256:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    bswapq %r8
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq %r8, %rdi
+; AVX512VL-NEXT:    shrq $4, %rdi
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %r9, %rdi
+; AVX512VL-NEXT:    andq %r9, %r8
+; AVX512VL-NEXT:    shlq $4, %r8
+; AVX512VL-NEXT:    orq %rdi, %r8
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %r8, %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    shrq $2, %r8
+; AVX512VL-NEXT:    andq %rdi, %r8
+; AVX512VL-NEXT:    leaq (%r8,%r10,4), %r10
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %r8, %r10
+; AVX512VL-NEXT:    bswapq %rcx
+; AVX512VL-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VL-NEXT:    movq %rcx, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %r9, %r11
+; AVX512VL-NEXT:    andq %r9, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %r11, %rcx
+; AVX512VL-NEXT:    movq %rcx, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%r11,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %r11
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %r8, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%r11,2), %rcx
+; AVX512VL-NEXT:    bswapq %rdx
+; AVX512VL-NEXT:    movq %rdx, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %r9, %r11
+; AVX512VL-NEXT:    andq %r9, %rdx
+; AVX512VL-NEXT:    shlq $4, %rdx
+; AVX512VL-NEXT:    orq %r11, %rdx
+; AVX512VL-NEXT:    movq %rdx, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq $2, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%r11,4), %rdx
+; AVX512VL-NEXT:    movq %rdx, %r11
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    shrq %rdx
+; AVX512VL-NEXT:    andq %r8, %rdx
+; AVX512VL-NEXT:    bswapq %rsi
+; AVX512VL-NEXT:    leaq (%rdx,%r11,2), %rdx
+; AVX512VL-NEXT:    movq %rsi, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %r9, %r11
+; AVX512VL-NEXT:    andq %r9, %rsi
+; AVX512VL-NEXT:    shlq $4, %rsi
+; AVX512VL-NEXT:    orq %r11, %rsi
+; AVX512VL-NEXT:    movq %rsi, %r9
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    shrq $2, %rsi
+; AVX512VL-NEXT:    andq %rdi, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%r9,4), %rsi
+; AVX512VL-NEXT:    movq %rsi, %rdi
+; AVX512VL-NEXT:    andq %r8, %rdi
+; AVX512VL-NEXT:    shrq %rsi
+; AVX512VL-NEXT:    andq %r8, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%rdi,2), %rsi
+; AVX512VL-NEXT:    movq %rsi, 24(%rax)
+; AVX512VL-NEXT:    movq %rdx, 16(%rax)
+; AVX512VL-NEXT:    movq %rcx, 8(%rax)
+; AVX512VL-NEXT:    movq %r10, (%rax)
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i256:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    bswapq %r8
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq %r8, %rdi
+; AVX512VBMI-NEXT:    shrq $4, %rdi
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %r9 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %r9, %rdi
+; AVX512VBMI-NEXT:    andq %r9, %r8
+; AVX512VBMI-NEXT:    shlq $4, %r8
+; AVX512VBMI-NEXT:    orq %rdi, %r8
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %r8, %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    shrq $2, %r8
+; AVX512VBMI-NEXT:    andq %rdi, %r8
+; AVX512VBMI-NEXT:    leaq (%r8,%r10,4), %r10
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %r8, %r10
+; AVX512VBMI-NEXT:    bswapq %rcx
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VBMI-NEXT:    movq %rcx, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %r9, %r11
+; AVX512VBMI-NEXT:    andq %r9, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %r11, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%r11,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %r11
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %r8, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%r11,2), %rcx
+; AVX512VBMI-NEXT:    bswapq %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %r9, %r11
+; AVX512VBMI-NEXT:    andq %r9, %rdx
+; AVX512VBMI-NEXT:    shlq $4, %rdx
+; AVX512VBMI-NEXT:    orq %r11, %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq $2, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%r11,4), %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %r11
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    shrq %rdx
+; AVX512VBMI-NEXT:    andq %r8, %rdx
+; AVX512VBMI-NEXT:    bswapq %rsi
+; AVX512VBMI-NEXT:    leaq (%rdx,%r11,2), %rdx
+; AVX512VBMI-NEXT:    movq %rsi, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %r9, %r11
+; AVX512VBMI-NEXT:    andq %r9, %rsi
+; AVX512VBMI-NEXT:    shlq $4, %rsi
+; AVX512VBMI-NEXT:    orq %r11, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %r9
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    shrq $2, %rsi
+; AVX512VBMI-NEXT:    andq %rdi, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%r9,4), %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rdi
+; AVX512VBMI-NEXT:    andq %r8, %rdi
+; AVX512VBMI-NEXT:    shrq %rsi
+; AVX512VBMI-NEXT:    andq %r8, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%rdi,2), %rsi
+; AVX512VBMI-NEXT:    movq %rsi, 24(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 16(%rax)
+; AVX512VBMI-NEXT:    movq %rcx, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r10, (%rax)
+; AVX512VBMI-NEXT:    retq
+  %res = call i256 @llvm.bitreverse.i256(i256 %a0)
+  ret i256 %res
+}
+
+define i256 @bitreverse_i256_vector(<4 x i64> %v0) nounwind {
+; SSE2-LABEL: bitreverse_i256_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdi
+; SSE2-NEXT:    bswapq %rdi
+; SSE2-NEXT:    movq %rdi, %rdx
+; SSE2-NEXT:    shrq $4, %rdx
+; SSE2-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE2-NEXT:    andq %rsi, %rdx
+; SSE2-NEXT:    andq %rsi, %rdi
+; SSE2-NEXT:    shlq $4, %rdi
+; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; SSE2-NEXT:    movq %rdi, %r8
+; SSE2-NEXT:    andq %rdx, %r8
+; SSE2-NEXT:    shrq $2, %rdi
+; SSE2-NEXT:    andq %rdx, %rdi
+; SSE2-NEXT:    leaq (%rdi,%r8,4), %r8
+; SSE2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE2-NEXT:    movq %r8, %r11
+; SSE2-NEXT:    andq %rdi, %r11
+; SSE2-NEXT:    shrq %r8
+; SSE2-NEXT:    andq %rdi, %r8
+; SSE2-NEXT:    leaq (%r8,%r11,2), %r8
+; SSE2-NEXT:    bswapq %r10
+; SSE2-NEXT:    movq %r10, %r11
+; SSE2-NEXT:    shrq $4, %r11
+; SSE2-NEXT:    andq %rsi, %r11
+; SSE2-NEXT:    andq %rsi, %r10
+; SSE2-NEXT:    shlq $4, %r10
+; SSE2-NEXT:    orq %r11, %r10
+; SSE2-NEXT:    movq %r10, %r11
+; SSE2-NEXT:    andq %rdx, %r11
+; SSE2-NEXT:    shrq $2, %r10
+; SSE2-NEXT:    andq %rdx, %r10
+; SSE2-NEXT:    leaq (%r10,%r11,4), %r10
+; SSE2-NEXT:    movq %r10, %r11
+; SSE2-NEXT:    andq %rdi, %r11
+; SSE2-NEXT:    shrq %r10
+; SSE2-NEXT:    andq %rdi, %r10
+; SSE2-NEXT:    bswapq %r9
+; SSE2-NEXT:    leaq (%r10,%r11,2), %r10
+; SSE2-NEXT:    movq %r9, %r11
+; SSE2-NEXT:    shrq $4, %r11
+; SSE2-NEXT:    andq %rsi, %r11
+; SSE2-NEXT:    andq %rsi, %r9
+; SSE2-NEXT:    shlq $4, %r9
+; SSE2-NEXT:    orq %r11, %r9
+; SSE2-NEXT:    movq %r9, %r11
+; SSE2-NEXT:    andq %rdx, %r11
+; SSE2-NEXT:    shrq $2, %r9
+; SSE2-NEXT:    andq %rdx, %r9
+; SSE2-NEXT:    leaq (%r9,%r11,4), %r9
+; SSE2-NEXT:    movq %r9, %r11
+; SSE2-NEXT:    andq %rdi, %r11
+; SSE2-NEXT:    shrq %r9
+; SSE2-NEXT:    andq %rdi, %r9
+; SSE2-NEXT:    leaq (%r9,%r11,2), %r9
+; SSE2-NEXT:    bswapq %rcx
+; SSE2-NEXT:    movq %rcx, %r11
+; SSE2-NEXT:    shrq $4, %r11
+; SSE2-NEXT:    andq %rsi, %r11
+; SSE2-NEXT:    andq %rsi, %rcx
+; SSE2-NEXT:    shlq $4, %rcx
+; SSE2-NEXT:    orq %r11, %rcx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    andq %rdx, %rsi
+; SSE2-NEXT:    shrq $2, %rcx
+; SSE2-NEXT:    andq %rdx, %rcx
+; SSE2-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; SSE2-NEXT:    movq %rcx, %rdx
+; SSE2-NEXT:    andq %rdi, %rdx
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andq %rdi, %rcx
+; SSE2-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; SSE2-NEXT:    movq %rcx, 24(%rax)
+; SSE2-NEXT:    movq %r9, 8(%rax)
+; SSE2-NEXT:    movq %r10, 16(%rax)
+; SSE2-NEXT:    movq %r8, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bitreverse_i256_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    pextrq $1, %xmm0, %r9
+; SSE42-NEXT:    movq %xmm0, %rcx
+; SSE42-NEXT:    movq %xmm1, %r10
+; SSE42-NEXT:    pextrq $1, %xmm1, %rdi
+; SSE42-NEXT:    bswapq %rdi
+; SSE42-NEXT:    movq %rdi, %rdx
+; SSE42-NEXT:    shrq $4, %rdx
+; SSE42-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE42-NEXT:    andq %rsi, %rdx
+; SSE42-NEXT:    andq %rsi, %rdi
+; SSE42-NEXT:    shlq $4, %rdi
+; SSE42-NEXT:    orq %rdx, %rdi
+; SSE42-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; SSE42-NEXT:    movq %rdi, %r8
+; SSE42-NEXT:    andq %rdx, %r8
+; SSE42-NEXT:    shrq $2, %rdi
+; SSE42-NEXT:    andq %rdx, %rdi
+; SSE42-NEXT:    leaq (%rdi,%r8,4), %r8
+; SSE42-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE42-NEXT:    movq %r8, %r11
+; SSE42-NEXT:    andq %rdi, %r11
+; SSE42-NEXT:    shrq %r8
+; SSE42-NEXT:    andq %rdi, %r8
+; SSE42-NEXT:    leaq (%r8,%r11,2), %r8
+; SSE42-NEXT:    bswapq %r10
+; SSE42-NEXT:    movq %r10, %r11
+; SSE42-NEXT:    shrq $4, %r11
+; SSE42-NEXT:    andq %rsi, %r11
+; SSE42-NEXT:    andq %rsi, %r10
+; SSE42-NEXT:    shlq $4, %r10
+; SSE42-NEXT:    orq %r11, %r10
+; SSE42-NEXT:    movq %r10, %r11
+; SSE42-NEXT:    andq %rdx, %r11
+; SSE42-NEXT:    shrq $2, %r10
+; SSE42-NEXT:    andq %rdx, %r10
+; SSE42-NEXT:    leaq (%r10,%r11,4), %r10
+; SSE42-NEXT:    movq %r10, %r11
+; SSE42-NEXT:    andq %rdi, %r11
+; SSE42-NEXT:    shrq %r10
+; SSE42-NEXT:    andq %rdi, %r10
+; SSE42-NEXT:    bswapq %r9
+; SSE42-NEXT:    leaq (%r10,%r11,2), %r10
+; SSE42-NEXT:    movq %r9, %r11
+; SSE42-NEXT:    shrq $4, %r11
+; SSE42-NEXT:    andq %rsi, %r11
+; SSE42-NEXT:    andq %rsi, %r9
+; SSE42-NEXT:    shlq $4, %r9
+; SSE42-NEXT:    orq %r11, %r9
+; SSE42-NEXT:    movq %r9, %r11
+; SSE42-NEXT:    andq %rdx, %r11
+; SSE42-NEXT:    shrq $2, %r9
+; SSE42-NEXT:    andq %rdx, %r9
+; SSE42-NEXT:    leaq (%r9,%r11,4), %r9
+; SSE42-NEXT:    movq %r9, %r11
+; SSE42-NEXT:    andq %rdi, %r11
+; SSE42-NEXT:    shrq %r9
+; SSE42-NEXT:    andq %rdi, %r9
+; SSE42-NEXT:    leaq (%r9,%r11,2), %r9
+; SSE42-NEXT:    bswapq %rcx
+; SSE42-NEXT:    movq %rcx, %r11
+; SSE42-NEXT:    shrq $4, %r11
+; SSE42-NEXT:    andq %rsi, %r11
+; SSE42-NEXT:    andq %rsi, %rcx
+; SSE42-NEXT:    shlq $4, %rcx
+; SSE42-NEXT:    orq %r11, %rcx
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    andq %rdx, %rsi
+; SSE42-NEXT:    shrq $2, %rcx
+; SSE42-NEXT:    andq %rdx, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; SSE42-NEXT:    movq %rcx, %rdx
+; SSE42-NEXT:    andq %rdi, %rdx
+; SSE42-NEXT:    shrq %rcx
+; SSE42-NEXT:    andq %rdi, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; SSE42-NEXT:    movq %rcx, 24(%rax)
+; SSE42-NEXT:    movq %r9, 16(%rax)
+; SSE42-NEXT:    movq %r10, 8(%rax)
+; SSE42-NEXT:    movq %r8, (%rax)
+; SSE42-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i256_vector:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    bswapq %rdi
+; AVX2-NEXT:    movq %rdi, %rdx
+; AVX2-NEXT:    shrq $4, %rdx
+; AVX2-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rsi, %rdx
+; AVX2-NEXT:    andq %rsi, %rdi
+; AVX2-NEXT:    shlq $4, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %rdi, %r8
+; AVX2-NEXT:    andq %rdx, %r8
+; AVX2-NEXT:    shrq $2, %rdi
+; AVX2-NEXT:    andq %rdx, %rdi
+; AVX2-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r8, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    bswapq %r10
+; AVX2-NEXT:    leaq (%r8,%r11,2), %r8
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rsi, %r11
+; AVX2-NEXT:    andq %rsi, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r11, %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX2-NEXT:    bswapq %r9
+; AVX2-NEXT:    movq %r9, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rsi, %r11
+; AVX2-NEXT:    andq %rsi, %r9
+; AVX2-NEXT:    shlq $4, %r9
+; AVX2-NEXT:    orq %r11, %r9
+; AVX2-NEXT:    movq %r9, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    shrq $2, %r9
+; AVX2-NEXT:    andq %rdx, %r9
+; AVX2-NEXT:    leaq (%r9,%r11,4), %r9
+; AVX2-NEXT:    movq %r9, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    leaq (%r9,%r11,2), %r9
+; AVX2-NEXT:    bswapq %rcx
+; AVX2-NEXT:    movq %rcx, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rsi, %r11
+; AVX2-NEXT:    andq %rsi, %rcx
+; AVX2-NEXT:    shlq $4, %rcx
+; AVX2-NEXT:    orq %r11, %rcx
+; AVX2-NEXT:    movq %rcx, %rsi
+; AVX2-NEXT:    andq %rdx, %rsi
+; AVX2-NEXT:    shrq $2, %rcx
+; AVX2-NEXT:    andq %rdx, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX2-NEXT:    movq %rcx, 24(%rax)
+; AVX2-NEXT:    movq %r9, 16(%rax)
+; AVX2-NEXT:    movq %r10, 8(%rax)
+; AVX2-NEXT:    movq %r8, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i256_vector:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512F-NEXT:    vmovq %xmm0, %rcx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512F-NEXT:    vmovq %xmm0, %r10
+; AVX512F-NEXT:    bswapq %rdi
+; AVX512F-NEXT:    movq %rdi, %rdx
+; AVX512F-NEXT:    shrq $4, %rdx
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rsi, %rdx
+; AVX512F-NEXT:    andq %rsi, %rdi
+; AVX512F-NEXT:    shlq $4, %rdi
+; AVX512F-NEXT:    orq %rdx, %rdi
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %rdi, %r8
+; AVX512F-NEXT:    andq %rdx, %r8
+; AVX512F-NEXT:    shrq $2, %rdi
+; AVX512F-NEXT:    andq %rdx, %rdi
+; AVX512F-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX512F-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r8, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    bswapq %r10
+; AVX512F-NEXT:    leaq (%r8,%r11,2), %r8
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rsi, %r11
+; AVX512F-NEXT:    andq %rsi, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r11, %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rdx, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512F-NEXT:    bswapq %r9
+; AVX512F-NEXT:    movq %r9, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rsi, %r11
+; AVX512F-NEXT:    andq %rsi, %r9
+; AVX512F-NEXT:    shlq $4, %r9
+; AVX512F-NEXT:    orq %r11, %r9
+; AVX512F-NEXT:    movq %r9, %r11
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    shrq $2, %r9
+; AVX512F-NEXT:    andq %rdx, %r9
+; AVX512F-NEXT:    leaq (%r9,%r11,4), %r9
+; AVX512F-NEXT:    movq %r9, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    leaq (%r9,%r11,2), %r9
+; AVX512F-NEXT:    bswapq %rcx
+; AVX512F-NEXT:    movq %rcx, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rsi, %r11
+; AVX512F-NEXT:    andq %rsi, %rcx
+; AVX512F-NEXT:    shlq $4, %rcx
+; AVX512F-NEXT:    orq %r11, %rcx
+; AVX512F-NEXT:    movq %rcx, %rsi
+; AVX512F-NEXT:    andq %rdx, %rsi
+; AVX512F-NEXT:    shrq $2, %rcx
+; AVX512F-NEXT:    andq %rdx, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512F-NEXT:    movq %rcx, 24(%rax)
+; AVX512F-NEXT:    movq %r9, 16(%rax)
+; AVX512F-NEXT:    movq %r10, 8(%rax)
+; AVX512F-NEXT:    movq %r8, (%rax)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i256_vector:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512VL-NEXT:    vmovq %xmm0, %rcx
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512VL-NEXT:    bswapq %rdi
+; AVX512VL-NEXT:    vmovq %xmm0, %r10
+; AVX512VL-NEXT:    movq %rdi, %rdx
+; AVX512VL-NEXT:    shrq $4, %rdx
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %rsi, %rdx
+; AVX512VL-NEXT:    andq %rsi, %rdi
+; AVX512VL-NEXT:    shlq $4, %rdi
+; AVX512VL-NEXT:    orq %rdx, %rdi
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %rdi, %r8
+; AVX512VL-NEXT:    andq %rdx, %r8
+; AVX512VL-NEXT:    shrq $2, %rdi
+; AVX512VL-NEXT:    andq %rdx, %rdi
+; AVX512VL-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %r8, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq %r8
+; AVX512VL-NEXT:    andq %rdi, %r8
+; AVX512VL-NEXT:    bswapq %r10
+; AVX512VL-NEXT:    leaq (%r8,%r11,2), %r8
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rsi, %r11
+; AVX512VL-NEXT:    andq %rsi, %r10
+; AVX512VL-NEXT:    shlq $4, %r10
+; AVX512VL-NEXT:    orq %r11, %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    shrq $2, %r10
+; AVX512VL-NEXT:    andq %rdx, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VL-NEXT:    bswapq %r9
+; AVX512VL-NEXT:    movq %r9, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rsi, %r11
+; AVX512VL-NEXT:    andq %rsi, %r9
+; AVX512VL-NEXT:    shlq $4, %r9
+; AVX512VL-NEXT:    orq %r11, %r9
+; AVX512VL-NEXT:    movq %r9, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    shrq $2, %r9
+; AVX512VL-NEXT:    andq %rdx, %r9
+; AVX512VL-NEXT:    leaq (%r9,%r11,4), %r9
+; AVX512VL-NEXT:    movq %r9, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq %r9
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    bswapq %rcx
+; AVX512VL-NEXT:    leaq (%r9,%r11,2), %r9
+; AVX512VL-NEXT:    movq %rcx, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rsi, %r11
+; AVX512VL-NEXT:    andq %rsi, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %r11, %rcx
+; AVX512VL-NEXT:    movq %rcx, %rsi
+; AVX512VL-NEXT:    andq %rdx, %rsi
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %rdx, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512VL-NEXT:    movq %rcx, 24(%rax)
+; AVX512VL-NEXT:    movq %r9, 16(%rax)
+; AVX512VL-NEXT:    movq %r10, 8(%rax)
+; AVX512VL-NEXT:    movq %r8, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i256_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %r9
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rcx
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512VBMI-NEXT:    bswapq %rdi
+; AVX512VBMI-NEXT:    vmovq %xmm0, %r10
+; AVX512VBMI-NEXT:    movq %rdi, %rdx
+; AVX512VBMI-NEXT:    shrq $4, %rdx
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %rsi, %rdx
+; AVX512VBMI-NEXT:    andq %rsi, %rdi
+; AVX512VBMI-NEXT:    shlq $4, %rdi
+; AVX512VBMI-NEXT:    orq %rdx, %rdi
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %rdi, %r8
+; AVX512VBMI-NEXT:    andq %rdx, %r8
+; AVX512VBMI-NEXT:    shrq $2, %rdi
+; AVX512VBMI-NEXT:    andq %rdx, %rdi
+; AVX512VBMI-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %r8, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq %r8
+; AVX512VBMI-NEXT:    andq %rdi, %r8
+; AVX512VBMI-NEXT:    bswapq %r10
+; AVX512VBMI-NEXT:    leaq (%r8,%r11,2), %r8
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %r10
+; AVX512VBMI-NEXT:    shlq $4, %r10
+; AVX512VBMI-NEXT:    orq %r11, %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    shrq $2, %r10
+; AVX512VBMI-NEXT:    andq %rdx, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VBMI-NEXT:    bswapq %r9
+; AVX512VBMI-NEXT:    movq %r9, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %r9
+; AVX512VBMI-NEXT:    shlq $4, %r9
+; AVX512VBMI-NEXT:    orq %r11, %r9
+; AVX512VBMI-NEXT:    movq %r9, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    shrq $2, %r9
+; AVX512VBMI-NEXT:    andq %rdx, %r9
+; AVX512VBMI-NEXT:    leaq (%r9,%r11,4), %r9
+; AVX512VBMI-NEXT:    movq %r9, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq %r9
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    bswapq %rcx
+; AVX512VBMI-NEXT:    leaq (%r9,%r11,2), %r9
+; AVX512VBMI-NEXT:    movq %rcx, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %r11
+; AVX512VBMI-NEXT:    andq %rsi, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %r11, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rsi
+; AVX512VBMI-NEXT:    andq %rdx, %rsi
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %rdx, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 24(%rax)
+; AVX512VBMI-NEXT:    movq %r9, 16(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r8, (%rax)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %a0 = bitcast <4 x i64> %v0 to i256
+  %res = call i256 @llvm.bitreverse.i256(i256 %a0)
+  ret i256 %res
+}
+
+define i256 @bitreverse_i256_load(ptr %p0) nounwind {
+; SSE-LABEL: bitreverse_i256_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq (%rsi), %rcx
+; SSE-NEXT:    movq 8(%rsi), %r9
+; SSE-NEXT:    movq 16(%rsi), %r10
+; SSE-NEXT:    movq 24(%rsi), %rdi
+; SSE-NEXT:    bswapq %rdi
+; SSE-NEXT:    movq %rdi, %rdx
+; SSE-NEXT:    shrq $4, %rdx
+; SSE-NEXT:    movabsq $1085102592571150095, %rsi # imm = 0xF0F0F0F0F0F0F0F
+; SSE-NEXT:    andq %rsi, %rdx
+; SSE-NEXT:    andq %rsi, %rdi
+; SSE-NEXT:    shlq $4, %rdi
+; SSE-NEXT:    orq %rdx, %rdi
+; SSE-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rdi, %r8
+; SSE-NEXT:    andq %rdx, %r8
+; SSE-NEXT:    shrq $2, %rdi
+; SSE-NEXT:    andq %rdx, %rdi
+; SSE-NEXT:    leaq (%rdi,%r8,4), %r8
+; SSE-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; SSE-NEXT:    movq %r8, %r11
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shrq %r8
+; SSE-NEXT:    andq %rdi, %r8
+; SSE-NEXT:    leaq (%r8,%r11,2), %r8
+; SSE-NEXT:    bswapq %r10
+; SSE-NEXT:    movq %r10, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %rsi, %r11
+; SSE-NEXT:    andq %rsi, %r10
+; SSE-NEXT:    shlq $4, %r10
+; SSE-NEXT:    orq %r11, %r10
+; SSE-NEXT:    movq %r10, %r11
+; SSE-NEXT:    andq %rdx, %r11
+; SSE-NEXT:    shrq $2, %r10
+; SSE-NEXT:    andq %rdx, %r10
+; SSE-NEXT:    leaq (%r10,%r11,4), %r10
+; SSE-NEXT:    movq %r10, %r11
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shrq %r10
+; SSE-NEXT:    andq %rdi, %r10
+; SSE-NEXT:    bswapq %r9
+; SSE-NEXT:    leaq (%r10,%r11,2), %r10
+; SSE-NEXT:    movq %r9, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %rsi, %r11
+; SSE-NEXT:    andq %rsi, %r9
+; SSE-NEXT:    shlq $4, %r9
+; SSE-NEXT:    orq %r11, %r9
+; SSE-NEXT:    movq %r9, %r11
+; SSE-NEXT:    andq %rdx, %r11
+; SSE-NEXT:    shrq $2, %r9
+; SSE-NEXT:    andq %rdx, %r9
+; SSE-NEXT:    leaq (%r9,%r11,4), %r9
+; SSE-NEXT:    movq %r9, %r11
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shrq %r9
+; SSE-NEXT:    andq %rdi, %r9
+; SSE-NEXT:    leaq (%r9,%r11,2), %r9
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    shrq $4, %r11
+; SSE-NEXT:    andq %rsi, %r11
+; SSE-NEXT:    andq %rsi, %rcx
+; SSE-NEXT:    shlq $4, %rcx
+; SSE-NEXT:    orq %r11, %rcx
+; SSE-NEXT:    movq %rcx, %rsi
+; SSE-NEXT:    andq %rdx, %rsi
+; SSE-NEXT:    shrq $2, %rcx
+; SSE-NEXT:    andq %rdx, %rcx
+; SSE-NEXT:    leaq (%rcx,%rsi,4), %rcx
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    andq %rdi, %rdx
+; SSE-NEXT:    shrq %rcx
+; SSE-NEXT:    andq %rdi, %rcx
+; SSE-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; SSE-NEXT:    movq %rcx, 24(%rax)
+; SSE-NEXT:    movq %r9, 16(%rax)
+; SSE-NEXT:    movq %r10, 8(%rax)
+; SSE-NEXT:    movq %r8, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i256_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movbeq 24(%rsi), %rdi
+; AVX2-NEXT:    movq %rdi, %rcx
+; AVX2-NEXT:    shrq $4, %rcx
+; AVX2-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rdx, %rcx
+; AVX2-NEXT:    andq %rdx, %rdi
+; AVX2-NEXT:    shlq $4, %rdi
+; AVX2-NEXT:    orq %rcx, %rdi
+; AVX2-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %rdi, %r8
+; AVX2-NEXT:    andq %rcx, %r8
+; AVX2-NEXT:    shrq $2, %rdi
+; AVX2-NEXT:    andq %rcx, %rdi
+; AVX2-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r8, %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    shrq %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    movbeq 16(%rsi), %r10
+; AVX2-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    shrq $4, %r9
+; AVX2-NEXT:    andq %rdx, %r9
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r9, %r10
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    andq %rcx, %r9
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rcx, %r10
+; AVX2-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    shrq %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX2-NEXT:    movbeq 8(%rsi), %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r11, %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rcx, %r11
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rcx, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX2-NEXT:    movbeq (%rsi), %rsi
+; AVX2-NEXT:    movq %rsi, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    andq %rdx, %rsi
+; AVX2-NEXT:    shlq $4, %rsi
+; AVX2-NEXT:    orq %r11, %rsi
+; AVX2-NEXT:    movq %rsi, %rdx
+; AVX2-NEXT:    andq %rcx, %rdx
+; AVX2-NEXT:    shrq $2, %rsi
+; AVX2-NEXT:    andq %rcx, %rsi
+; AVX2-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX2-NEXT:    movq %rcx, 24(%rax)
+; AVX2-NEXT:    movq %r10, 16(%rax)
+; AVX2-NEXT:    movq %r9, 8(%rax)
+; AVX2-NEXT:    movq %r8, (%rax)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i256_load:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movbeq 24(%rsi), %rdi
+; AVX512F-NEXT:    movq %rdi, %rcx
+; AVX512F-NEXT:    shrq $4, %rcx
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rdx, %rcx
+; AVX512F-NEXT:    andq %rdx, %rdi
+; AVX512F-NEXT:    shlq $4, %rdi
+; AVX512F-NEXT:    orq %rcx, %rdi
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %rdi, %r8
+; AVX512F-NEXT:    andq %rcx, %r8
+; AVX512F-NEXT:    shrq $2, %rdi
+; AVX512F-NEXT:    andq %rcx, %rdi
+; AVX512F-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX512F-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r8, %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    shrq %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    movbeq 16(%rsi), %r10
+; AVX512F-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    shrq $4, %r9
+; AVX512F-NEXT:    andq %rdx, %r9
+; AVX512F-NEXT:    andq %rdx, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r9, %r10
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    andq %rcx, %r9
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rcx, %r10
+; AVX512F-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX512F-NEXT:    movq %r9, %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    shrq %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512F-NEXT:    movbeq 8(%rsi), %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    andq %rdx, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r11, %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rcx, %r11
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rcx, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512F-NEXT:    movbeq (%rsi), %rsi
+; AVX512F-NEXT:    movq %rsi, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    andq %rdx, %rsi
+; AVX512F-NEXT:    shlq $4, %rsi
+; AVX512F-NEXT:    orq %r11, %rsi
+; AVX512F-NEXT:    movq %rsi, %rdx
+; AVX512F-NEXT:    andq %rcx, %rdx
+; AVX512F-NEXT:    shrq $2, %rsi
+; AVX512F-NEXT:    andq %rcx, %rsi
+; AVX512F-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512F-NEXT:    movq %rcx, 24(%rax)
+; AVX512F-NEXT:    movq %r10, 16(%rax)
+; AVX512F-NEXT:    movq %r9, 8(%rax)
+; AVX512F-NEXT:    movq %r8, (%rax)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i256_load:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movbeq 24(%rsi), %r8
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq %r8, %rcx
+; AVX512VL-NEXT:    shrq $4, %rcx
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %rdx, %rcx
+; AVX512VL-NEXT:    andq %rdx, %r8
+; AVX512VL-NEXT:    shlq $4, %r8
+; AVX512VL-NEXT:    orq %rcx, %r8
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %r8, %rdi
+; AVX512VL-NEXT:    andq %rcx, %rdi
+; AVX512VL-NEXT:    shrq $2, %r8
+; AVX512VL-NEXT:    andq %rcx, %r8
+; AVX512VL-NEXT:    leaq (%r8,%rdi,4), %r8
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %r8, %r9
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    shrq %r8
+; AVX512VL-NEXT:    andq %rdi, %r8
+; AVX512VL-NEXT:    movbeq 16(%rsi), %r10
+; AVX512VL-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX512VL-NEXT:    movq %r10, %r9
+; AVX512VL-NEXT:    shrq $4, %r9
+; AVX512VL-NEXT:    andq %rdx, %r9
+; AVX512VL-NEXT:    andq %rdx, %r10
+; AVX512VL-NEXT:    shlq $4, %r10
+; AVX512VL-NEXT:    orq %r9, %r10
+; AVX512VL-NEXT:    movq %r10, %r9
+; AVX512VL-NEXT:    andq %rcx, %r9
+; AVX512VL-NEXT:    shrq $2, %r10
+; AVX512VL-NEXT:    andq %rcx, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX512VL-NEXT:    movq %r9, %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    shrq %r9
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512VL-NEXT:    movbeq 8(%rsi), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    andq %rdx, %r10
+; AVX512VL-NEXT:    shlq $4, %r10
+; AVX512VL-NEXT:    orq %r11, %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %rcx, %r11
+; AVX512VL-NEXT:    shrq $2, %r10
+; AVX512VL-NEXT:    andq %rcx, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    movbeq (%rsi), %rsi
+; AVX512VL-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VL-NEXT:    movq %rsi, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    andq %rdx, %rsi
+; AVX512VL-NEXT:    shlq $4, %rsi
+; AVX512VL-NEXT:    orq %r11, %rsi
+; AVX512VL-NEXT:    movq %rsi, %rdx
+; AVX512VL-NEXT:    andq %rcx, %rdx
+; AVX512VL-NEXT:    shrq $2, %rsi
+; AVX512VL-NEXT:    andq %rcx, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512VL-NEXT:    movq %rcx, 24(%rax)
+; AVX512VL-NEXT:    movq %r10, 16(%rax)
+; AVX512VL-NEXT:    movq %r9, 8(%rax)
+; AVX512VL-NEXT:    movq %r8, (%rax)
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i256_load:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movbeq 24(%rsi), %r8
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq %r8, %rcx
+; AVX512VBMI-NEXT:    shrq $4, %rcx
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %rdx, %rcx
+; AVX512VBMI-NEXT:    andq %rdx, %r8
+; AVX512VBMI-NEXT:    shlq $4, %r8
+; AVX512VBMI-NEXT:    orq %rcx, %r8
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %r8, %rdi
+; AVX512VBMI-NEXT:    andq %rcx, %rdi
+; AVX512VBMI-NEXT:    shrq $2, %r8
+; AVX512VBMI-NEXT:    andq %rcx, %r8
+; AVX512VBMI-NEXT:    leaq (%r8,%rdi,4), %r8
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %r8, %r9
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    shrq %r8
+; AVX512VBMI-NEXT:    andq %rdi, %r8
+; AVX512VBMI-NEXT:    movbeq 16(%rsi), %r10
+; AVX512VBMI-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX512VBMI-NEXT:    movq %r10, %r9
+; AVX512VBMI-NEXT:    shrq $4, %r9
+; AVX512VBMI-NEXT:    andq %rdx, %r9
+; AVX512VBMI-NEXT:    andq %rdx, %r10
+; AVX512VBMI-NEXT:    shlq $4, %r10
+; AVX512VBMI-NEXT:    orq %r9, %r10
+; AVX512VBMI-NEXT:    movq %r10, %r9
+; AVX512VBMI-NEXT:    andq %rcx, %r9
+; AVX512VBMI-NEXT:    shrq $2, %r10
+; AVX512VBMI-NEXT:    andq %rcx, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX512VBMI-NEXT:    movq %r9, %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    shrq %r9
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512VBMI-NEXT:    movbeq 8(%rsi), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r10
+; AVX512VBMI-NEXT:    shlq $4, %r10
+; AVX512VBMI-NEXT:    orq %r11, %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %rcx, %r11
+; AVX512VBMI-NEXT:    shrq $2, %r10
+; AVX512VBMI-NEXT:    andq %rcx, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    movbeq (%rsi), %rsi
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VBMI-NEXT:    movq %rsi, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %rsi
+; AVX512VBMI-NEXT:    shlq $4, %rsi
+; AVX512VBMI-NEXT:    orq %r11, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rdx
+; AVX512VBMI-NEXT:    andq %rcx, %rdx
+; AVX512VBMI-NEXT:    shrq $2, %rsi
+; AVX512VBMI-NEXT:    andq %rcx, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 24(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 16(%rax)
+; AVX512VBMI-NEXT:    movq %r9, 8(%rax)
+; AVX512VBMI-NEXT:    movq %r8, (%rax)
+; AVX512VBMI-NEXT:    retq
+  %a0 = load i256, ptr %p0
+  %res = call i256 @llvm.bitreverse.i256(i256 %a0)
+  ret i256 %res
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; AVX512: {{.*}}
 ; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/bit-manip-i512.ll b/llvm/test/CodeGen/X86/bit-manip-i512.ll
index 407df83d1b2c3..a33beb82872ab 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i512.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i512.ll
@@ -5398,6 +5398,2991 @@ define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
   ret i512 %res
 }
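+; i512 coverage mirrors the i128/i256 files: bswap splits into eight
+; 64-bit limbs, and bitreverse adds the per-limb SWAR expansion on top.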
 
+define i512 @bswap_i512(i512 %a0) nounwind {
+; SSE-LABEL: bswap_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT:    bswapq %rbx
+; SSE-NEXT:    bswapq %r11
+; SSE-NEXT:    bswapq %r10
+; SSE-NEXT:    bswapq %r9
+; SSE-NEXT:    bswapq %r8
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, 56(%rdi)
+; SSE-NEXT:    movq %rdx, 48(%rdi)
+; SSE-NEXT:    movq %rcx, 40(%rdi)
+; SSE-NEXT:    movq %r8, 32(%rdi)
+; SSE-NEXT:    movq %r9, 24(%rdi)
+; SSE-NEXT:    movq %r10, 16(%rdi)
+; SSE-NEXT:    movq %r11, 8(%rdi)
+; SSE-NEXT:    movq %rbx, (%rdi)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bswap_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movbeq %rsi, 56(%rdi)
+; AVX2-NEXT:    movbeq %rdx, 48(%rdi)
+; AVX2-NEXT:    movbeq %rcx, 40(%rdi)
+; AVX2-NEXT:    movbeq %r8, 32(%rdi)
+; AVX2-NEXT:    movbeq %r9, 24(%rdi)
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    movbeq %rcx, 16(%rdi)
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    movbeq %rcx, 8(%rdi)
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX2-NEXT:    movbeq %rcx, (%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bswap_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movbeq %rsi, 56(%rdi)
+; AVX512F-NEXT:    movbeq %rdx, 48(%rdi)
+; AVX512F-NEXT:    movbeq %rcx, 40(%rdi)
+; AVX512F-NEXT:    movbeq %r8, 32(%rdi)
+; AVX512F-NEXT:    movbeq %r9, 24(%rdi)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    movbeq %rcx, 16(%rdi)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    movbeq %rcx, 8(%rdi)
+; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512F-NEXT:    movbeq %rcx, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bswap_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512VL-NEXT:    movbeq %rsi, 56(%rax)
+; AVX512VL-NEXT:    movbeq %rdx, 48(%rax)
+; AVX512VL-NEXT:    movbeq %rcx, 40(%rax)
+; AVX512VL-NEXT:    movbeq %r8, 32(%rax)
+; AVX512VL-NEXT:    movbeq %r9, 24(%rax)
+; AVX512VL-NEXT:    movbeq %r11, 16(%rax)
+; AVX512VL-NEXT:    movbeq %rdi, 8(%rax)
+; AVX512VL-NEXT:    movbeq %r10, (%rax)
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bswap_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512VBMI-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512VBMI-NEXT:    movbeq %rsi, 56(%rax)
+; AVX512VBMI-NEXT:    movbeq %rdx, 48(%rax)
+; AVX512VBMI-NEXT:    movbeq %rcx, 40(%rax)
+; AVX512VBMI-NEXT:    movbeq %r8, 32(%rax)
+; AVX512VBMI-NEXT:    movbeq %r9, 24(%rax)
+; AVX512VBMI-NEXT:    movbeq %r11, 16(%rax)
+; AVX512VBMI-NEXT:    movbeq %rdi, 8(%rax)
+; AVX512VBMI-NEXT:    movbeq %r10, (%rax)
+; AVX512VBMI-NEXT:    retq
+  %res = call i512 @llvm.bswap.i512(i512 %a0)
+  ret i512 %res
+}
+
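+; Same as above, but the value starts in vector registers: each 64-bit
+; limb is extracted (movq/pextrq/vextracti*) and byte-swapped, with the
+; swap folding into the stores (movbeq) when MOVBE is available.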
+define i512 @bswap_i512_vector(<8 x i64> %v0) nounwind {
+; SSE2-LABEL: bswap_i512_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    movq %xmm1, %rdx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %rsi
+; SSE2-NEXT:    movq %xmm2, %rdi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r8
+; SSE2-NEXT:    movq %xmm3, %r9
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm1, %r10
+; SSE2-NEXT:    bswapq %r10
+; SSE2-NEXT:    bswapq %r9
+; SSE2-NEXT:    bswapq %r8
+; SSE2-NEXT:    bswapq %rdi
+; SSE2-NEXT:    bswapq %rsi
+; SSE2-NEXT:    bswapq %rdx
+; SSE2-NEXT:    bswapq %rcx
+; SSE2-NEXT:    movq %xmm0, %r11
+; SSE2-NEXT:    bswapq %r11
+; SSE2-NEXT:    movq %r11, 56(%rax)
+; SSE2-NEXT:    movq %rcx, 48(%rax)
+; SSE2-NEXT:    movq %rdx, 40(%rax)
+; SSE2-NEXT:    movq %rsi, 32(%rax)
+; SSE2-NEXT:    movq %rdi, 24(%rax)
+; SSE2-NEXT:    movq %r8, 16(%rax)
+; SSE2-NEXT:    movq %r9, 8(%rax)
+; SSE2-NEXT:    movq %r10, (%rax)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bswap_i512_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    pextrq $1, %xmm0, %rcx
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    movq %xmm1, %rdx
+; SSE42-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE42-NEXT:    movq %xmm2, %rdi
+; SSE42-NEXT:    pextrq $1, %xmm2, %r8
+; SSE42-NEXT:    movq %xmm3, %r9
+; SSE42-NEXT:    pextrq $1, %xmm3, %r10
+; SSE42-NEXT:    bswapq %r10
+; SSE42-NEXT:    bswapq %r9
+; SSE42-NEXT:    bswapq %r8
+; SSE42-NEXT:    bswapq %rdi
+; SSE42-NEXT:    bswapq %rsi
+; SSE42-NEXT:    bswapq %rdx
+; SSE42-NEXT:    bswapq %rcx
+; SSE42-NEXT:    movq %xmm0, %r11
+; SSE42-NEXT:    bswapq %r11
+; SSE42-NEXT:    movq %r11, 56(%rax)
+; SSE42-NEXT:    movq %rcx, 48(%rax)
+; SSE42-NEXT:    movq %rdx, 40(%rax)
+; SSE42-NEXT:    movq %rsi, 32(%rax)
+; SSE42-NEXT:    movq %rdi, 24(%rax)
+; SSE42-NEXT:    movq %r8, 16(%rax)
+; SSE42-NEXT:    movq %r9, 8(%rax)
+; SSE42-NEXT:    movq %r10, (%rax)
+; SSE42-NEXT:    retq
+;
+; AVX2-LABEL: bswap_i512_vector:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX2-NEXT:    vmovq %xmm0, %r8
+; AVX2-NEXT:    movbeq %r8, 56(%rax)
+; AVX2-NEXT:    movbeq %rdi, 48(%rax)
+; AVX2-NEXT:    vmovq %xmm3, %rdi
+; AVX2-NEXT:    movbeq %rdi, 40(%rax)
+; AVX2-NEXT:    movbeq %rsi, 32(%rax)
+; AVX2-NEXT:    vmovq %xmm1, %rsi
+; AVX2-NEXT:    movbeq %rsi, 24(%rax)
+; AVX2-NEXT:    movbeq %rdx, 16(%rax)
+; AVX2-NEXT:    vmovq %xmm2, %rdx
+; AVX2-NEXT:    movbeq %rdx, 8(%rax)
+; AVX2-NEXT:    movbeq %rcx, (%rax)
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bswap_i512_vector:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm2
+; AVX512F-NEXT:    vpextrq $1, %xmm2, %rdx
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm3
+; AVX512F-NEXT:    vpextrq $1, %xmm3, %rsi
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdi
+; AVX512F-NEXT:    vmovq %xmm0, %r8
+; AVX512F-NEXT:    movbeq %r8, 56(%rax)
+; AVX512F-NEXT:    movbeq %rdi, 48(%rax)
+; AVX512F-NEXT:    vmovq %xmm3, %rdi
+; AVX512F-NEXT:    movbeq %rdi, 40(%rax)
+; AVX512F-NEXT:    movbeq %rsi, 32(%rax)
+; AVX512F-NEXT:    vmovq %xmm2, %rsi
+; AVX512F-NEXT:    movbeq %rsi, 24(%rax)
+; AVX512F-NEXT:    movbeq %rdx, 16(%rax)
+; AVX512F-NEXT:    vmovq %xmm1, %rdx
+; AVX512F-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512F-NEXT:    movbeq %rcx, (%rax)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bswap_i512_vector:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VL-NEXT:    vmovq %xmm1, %rdx
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX512VL-NEXT:    vmovq %xmm1, %rdi
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512VL-NEXT:    vmovq %xmm1, %r9
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512VL-NEXT:    vmovq %xmm0, %r11
+; AVX512VL-NEXT:    movbeq %r11, 56(%rax)
+; AVX512VL-NEXT:    movbeq %r10, 48(%rax)
+; AVX512VL-NEXT:    movbeq %r9, 40(%rax)
+; AVX512VL-NEXT:    movbeq %r8, 32(%rax)
+; AVX512VL-NEXT:    movbeq %rdi, 24(%rax)
+; AVX512VL-NEXT:    movbeq %rsi, 16(%rax)
+; AVX512VL-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512VL-NEXT:    movbeq %rcx, (%rax)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bswap_i512_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rcx
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rdx
+; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX512VBMI-NEXT:    vmovq %xmm1, %rdi
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r8
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r9
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %r10
+; AVX512VBMI-NEXT:    vmovq %xmm0, %r11
+; AVX512VBMI-NEXT:    movbeq %r11, 56(%rax)
+; AVX512VBMI-NEXT:    movbeq %r10, 48(%rax)
+; AVX512VBMI-NEXT:    movbeq %r9, 40(%rax)
+; AVX512VBMI-NEXT:    movbeq %r8, 32(%rax)
+; AVX512VBMI-NEXT:    movbeq %rdi, 24(%rax)
+; AVX512VBMI-NEXT:    movbeq %rsi, 16(%rax)
+; AVX512VBMI-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512VBMI-NEXT:    movbeq %rcx, (%rax)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %a0 = bitcast <8 x i64> %v0 to i512
+  %res = call i512 @llvm.bswap.i512(i512 %a0)
+  ret i512 %res
+}
+
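+; Load variant: the eight limbs are reloaded as scalars; with MOVBE the
+; byte swap folds into the stores rather than the loads here.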
+define i512 @bswap_i512_load(ptr %p0) nounwind {
+; SSE-LABEL: bswap_i512_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq 8(%rsi), %rcx
+; SSE-NEXT:    movq 16(%rsi), %rdx
+; SSE-NEXT:    movq 24(%rsi), %rdi
+; SSE-NEXT:    movq 32(%rsi), %r8
+; SSE-NEXT:    movq 40(%rsi), %r9
+; SSE-NEXT:    movq 48(%rsi), %r10
+; SSE-NEXT:    movq 56(%rsi), %r11
+; SSE-NEXT:    bswapq %r11
+; SSE-NEXT:    bswapq %r10
+; SSE-NEXT:    bswapq %r9
+; SSE-NEXT:    bswapq %r8
+; SSE-NEXT:    bswapq %rdi
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq (%rsi), %rsi
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, 56(%rax)
+; SSE-NEXT:    movq %rcx, 48(%rax)
+; SSE-NEXT:    movq %rdx, 40(%rax)
+; SSE-NEXT:    movq %rdi, 32(%rax)
+; SSE-NEXT:    movq %r8, 24(%rax)
+; SSE-NEXT:    movq %r9, 16(%rax)
+; SSE-NEXT:    movq %r10, 8(%rax)
+; SSE-NEXT:    movq %r11, (%rax)
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bswap_i512_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movq 56(%rsi), %rcx
+; AVX2-NEXT:    movq 48(%rsi), %rdx
+; AVX2-NEXT:    movq 40(%rsi), %rdi
+; AVX2-NEXT:    movq 32(%rsi), %r8
+; AVX2-NEXT:    movq 24(%rsi), %r9
+; AVX2-NEXT:    movq 16(%rsi), %r10
+; AVX2-NEXT:    movq (%rsi), %r11
+; AVX2-NEXT:    movq 8(%rsi), %rsi
+; AVX2-NEXT:    movbeq %r11, 56(%rax)
+; AVX2-NEXT:    movbeq %rsi, 48(%rax)
+; AVX2-NEXT:    movbeq %r10, 40(%rax)
+; AVX2-NEXT:    movbeq %r9, 32(%rax)
+; AVX2-NEXT:    movbeq %r8, 24(%rax)
+; AVX2-NEXT:    movbeq %rdi, 16(%rax)
+; AVX2-NEXT:    movbeq %rdx, 8(%rax)
+; AVX2-NEXT:    movbeq %rcx, (%rax)
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: bswap_i512_load:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    movq 56(%rsi), %rcx
+; AVX512-NEXT:    movq 48(%rsi), %rdx
+; AVX512-NEXT:    movq 40(%rsi), %rdi
+; AVX512-NEXT:    movq 32(%rsi), %r8
+; AVX512-NEXT:    movq 24(%rsi), %r9
+; AVX512-NEXT:    movq 16(%rsi), %r10
+; AVX512-NEXT:    movq (%rsi), %r11
+; AVX512-NEXT:    movq 8(%rsi), %rsi
+; AVX512-NEXT:    movbeq %r11, 56(%rax)
+; AVX512-NEXT:    movbeq %rsi, 48(%rax)
+; AVX512-NEXT:    movbeq %r10, 40(%rax)
+; AVX512-NEXT:    movbeq %r9, 32(%rax)
+; AVX512-NEXT:    movbeq %r8, 24(%rax)
+; AVX512-NEXT:    movbeq %rdi, 16(%rax)
+; AVX512-NEXT:    movbeq %rdx, 8(%rax)
+; AVX512-NEXT:    movbeq %rcx, (%rax)
+; AVX512-NEXT:    retq
+  %a0 = load i512, ptr %p0
+  %res = call i512 @llvm.bswap.i512(i512 %a0)
+  ret i512 %res
+}
+
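+; bitreverse expands per 64-bit limb: bswapq followed by three SWAR
+; mask/shift rounds (nibbles, bit-pairs, single bits), so the i512 case
+; spills into callee-saved GPRs to keep the masks and limbs live.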
+define i512 @bitreverse_i512(i512 %a0) nounwind {
+; SSE-LABEL: bitreverse_i512:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; SSE-NEXT:    bswapq %rdi
+; SSE-NEXT:    movq %rdi, %r10
+; SSE-NEXT:    shrq $4, %r10
+; SSE-NEXT:    movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; SSE-NEXT:    andq %r11, %r10
+; SSE-NEXT:    andq %r11, %rdi
+; SSE-NEXT:    shlq $4, %rdi
+; SSE-NEXT:    orq %r10, %rdi
+; SSE-NEXT:    movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rdi, %rbx
+; SSE-NEXT:    andq %r10, %rbx
+; SSE-NEXT:    shrq $2, %rdi
+; SSE-NEXT:    andq %r10, %rdi
+; SSE-NEXT:    leaq (%rdi,%rbx,4), %rdi
+; SSE-NEXT:    movabsq $6148914691236517205, %rbx # imm = 0x5555555555555555
+; SSE-NEXT:    movq %rdi, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %rdi
+; SSE-NEXT:    andq %rbx, %rdi
+; SSE-NEXT:    leaq (%rdi,%r12,2), %rdi
+; SSE-NEXT:    bswapq %r14
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %r14
+; SSE-NEXT:    shlq $4, %r14
+; SSE-NEXT:    orq %r12, %r14
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %r14
+; SSE-NEXT:    andq %r10, %r14
+; SSE-NEXT:    leaq (%r14,%r12,4), %r14
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %r14
+; SSE-NEXT:    andq %rbx, %r14
+; SSE-NEXT:    bswapq %r15
+; SSE-NEXT:    leaq (%r14,%r12,2), %r14
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %r15
+; SSE-NEXT:    shlq $4, %r15
+; SSE-NEXT:    orq %r12, %r15
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %r15
+; SSE-NEXT:    andq %r10, %r15
+; SSE-NEXT:    leaq (%r15,%r12,4), %r15
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %r15
+; SSE-NEXT:    andq %rbx, %r15
+; SSE-NEXT:    leaq (%r15,%r12,2), %r15
+; SSE-NEXT:    bswapq %r9
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %r9
+; SSE-NEXT:    shlq $4, %r9
+; SSE-NEXT:    orq %r12, %r9
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %r9
+; SSE-NEXT:    andq %r10, %r9
+; SSE-NEXT:    leaq (%r9,%r12,4), %r9
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %r9
+; SSE-NEXT:    andq %rbx, %r9
+; SSE-NEXT:    bswapq %r8
+; SSE-NEXT:    leaq (%r9,%r12,2), %r9
+; SSE-NEXT:    movq %r8, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %r8
+; SSE-NEXT:    shlq $4, %r8
+; SSE-NEXT:    orq %r12, %r8
+; SSE-NEXT:    movq %r8, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %r8
+; SSE-NEXT:    andq %r10, %r8
+; SSE-NEXT:    leaq (%r8,%r12,4), %r8
+; SSE-NEXT:    movq %r8, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %r8
+; SSE-NEXT:    andq %rbx, %r8
+; SSE-NEXT:    leaq (%r8,%r12,2), %r8
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq %rcx, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %rcx
+; SSE-NEXT:    shlq $4, %rcx
+; SSE-NEXT:    orq %r12, %rcx
+; SSE-NEXT:    movq %rcx, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %rcx
+; SSE-NEXT:    andq %r10, %rcx
+; SSE-NEXT:    leaq (%rcx,%r12,4), %rcx
+; SSE-NEXT:    movq %rcx, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %rcx
+; SSE-NEXT:    andq %rbx, %rcx
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    leaq (%rcx,%r12,2), %rcx
+; SSE-NEXT:    movq %rdx, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %rdx
+; SSE-NEXT:    shlq $4, %rdx
+; SSE-NEXT:    orq %r12, %rdx
+; SSE-NEXT:    movq %rdx, %r12
+; SSE-NEXT:    andq %r10, %r12
+; SSE-NEXT:    shrq $2, %rdx
+; SSE-NEXT:    andq %r10, %rdx
+; SSE-NEXT:    leaq (%rdx,%r12,4), %rdx
+; SSE-NEXT:    movq %rdx, %r12
+; SSE-NEXT:    andq %rbx, %r12
+; SSE-NEXT:    shrq %rdx
+; SSE-NEXT:    andq %rbx, %rdx
+; SSE-NEXT:    leaq (%rdx,%r12,2), %rdx
+; SSE-NEXT:    bswapq %rsi
+; SSE-NEXT:    movq %rsi, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %r11, %r12
+; SSE-NEXT:    andq %r11, %rsi
+; SSE-NEXT:    shlq $4, %rsi
+; SSE-NEXT:    orq %r12, %rsi
+; SSE-NEXT:    movq %rsi, %r11
+; SSE-NEXT:    andq %r10, %r11
+; SSE-NEXT:    shrq $2, %rsi
+; SSE-NEXT:    andq %r10, %rsi
+; SSE-NEXT:    leaq (%rsi,%r11,4), %rsi
+; SSE-NEXT:    movq %rsi, %r10
+; SSE-NEXT:    andq %rbx, %r10
+; SSE-NEXT:    shrq %rsi
+; SSE-NEXT:    andq %rbx, %rsi
+; SSE-NEXT:    leaq (%rsi,%r10,2), %rsi
+; SSE-NEXT:    movq %rsi, 56(%rax)
+; SSE-NEXT:    movq %rdx, 48(%rax)
+; SSE-NEXT:    movq %rcx, 40(%rax)
+; SSE-NEXT:    movq %r8, 32(%rax)
+; SSE-NEXT:    movq %r9, 24(%rax)
+; SSE-NEXT:    movq %r15, 16(%rax)
+; SSE-NEXT:    movq %r14, 8(%rax)
+; SSE-NEXT:    movq %rdi, (%rax)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i512:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    bswapq %r9
+; AVX2-NEXT:    movq %r9, %rdi
+; AVX2-NEXT:    shrq $4, %rdi
+; AVX2-NEXT:    movabsq $1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %r10, %rdi
+; AVX2-NEXT:    andq %r10, %r9
+; AVX2-NEXT:    shlq $4, %r9
+; AVX2-NEXT:    orq %rdi, %r9
+; AVX2-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %r9, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq $2, %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    leaq (%r9,%r11,4), %r11
+; AVX2-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r11, %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    shrq %r11
+; AVX2-NEXT:    andq %r9, %r11
+; AVX2-NEXT:    bswapq %r8
+; AVX2-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX2-NEXT:    movq %r8, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %r10, %rbx
+; AVX2-NEXT:    andq %r10, %r8
+; AVX2-NEXT:    shlq $4, %r8
+; AVX2-NEXT:    orq %rbx, %r8
+; AVX2-NEXT:    movq %r8, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq $2, %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    leaq (%r8,%rbx,4), %r8
+; AVX2-NEXT:    movq %r8, %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    shrq %r8
+; AVX2-NEXT:    andq %r9, %r8
+; AVX2-NEXT:    leaq (%r8,%rbx,2), %r8
+; AVX2-NEXT:    bswapq %rcx
+; AVX2-NEXT:    movq %rcx, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %r10, %rbx
+; AVX2-NEXT:    andq %r10, %rcx
+; AVX2-NEXT:    shlq $4, %rcx
+; AVX2-NEXT:    orq %rbx, %rcx
+; AVX2-NEXT:    movq %rcx, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq $2, %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rbx,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %r9, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rbx,2), %rcx
+; AVX2-NEXT:    bswapq %rdx
+; AVX2-NEXT:    movq %rdx, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %r10, %rbx
+; AVX2-NEXT:    andq %r10, %rdx
+; AVX2-NEXT:    shlq $4, %rdx
+; AVX2-NEXT:    orq %rbx, %rdx
+; AVX2-NEXT:    movq %rdx, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq $2, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    leaq (%rdx,%rbx,4), %rdx
+; AVX2-NEXT:    movq %rdx, %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    shrq %rdx
+; AVX2-NEXT:    andq %r9, %rdx
+; AVX2-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; AVX2-NEXT:    bswapq %rsi
+; AVX2-NEXT:    movq %rsi, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %r10, %rbx
+; AVX2-NEXT:    andq %r10, %rsi
+; AVX2-NEXT:    shlq $4, %rsi
+; AVX2-NEXT:    orq %rbx, %rsi
+; AVX2-NEXT:    movq %rsi, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq $2, %rsi
+; AVX2-NEXT:    andq %rdi, %rsi
+; AVX2-NEXT:    leaq (%rsi,%rbx,4), %rsi
+; AVX2-NEXT:    movq %rsi, %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    shrq %rsi
+; AVX2-NEXT:    andq %r9, %rsi
+; AVX2-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    leaq (%rsi,%rbx,2), %rsi
+; AVX2-NEXT:    movq %r14, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %r10, %rbx
+; AVX2-NEXT:    andq %r10, %r14
+; AVX2-NEXT:    shlq $4, %r14
+; AVX2-NEXT:    orq %rbx, %r14
+; AVX2-NEXT:    movq %r14, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq $2, %r14
+; AVX2-NEXT:    andq %rdi, %r14
+; AVX2-NEXT:    leaq (%r14,%rbx,4), %rbx
+; AVX2-NEXT:    movq %rbx, %r14
+; AVX2-NEXT:    andq %r9, %r14
+; AVX2-NEXT:    shrq %rbx
+; AVX2-NEXT:    andq %r9, %rbx
+; AVX2-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX2-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    movq %r14, %r15
+; AVX2-NEXT:    shrq $4, %r15
+; AVX2-NEXT:    andq %r10, %r15
+; AVX2-NEXT:    andq %r10, %r14
+; AVX2-NEXT:    shlq $4, %r14
+; AVX2-NEXT:    orq %r15, %r14
+; AVX2-NEXT:    movq %r14, %r15
+; AVX2-NEXT:    andq %rdi, %r15
+; AVX2-NEXT:    shrq $2, %r14
+; AVX2-NEXT:    andq %rdi, %r14
+; AVX2-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX2-NEXT:    movq %r14, %r15
+; AVX2-NEXT:    andq %r9, %r15
+; AVX2-NEXT:    shrq %r14
+; AVX2-NEXT:    andq %r9, %r14
+; AVX2-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX2-NEXT:    movbeq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %r10, %r12
+; AVX2-NEXT:    andq %r10, %r15
+; AVX2-NEXT:    shlq $4, %r15
+; AVX2-NEXT:    orq %r12, %r15
+; AVX2-NEXT:    movq %r15, %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    shrq $2, %r15
+; AVX2-NEXT:    andq %rdi, %r15
+; AVX2-NEXT:    leaq (%r15,%r10,4), %rdi
+; AVX2-NEXT:    movq %rdi, %r10
+; AVX2-NEXT:    andq %r9, %r10
+; AVX2-NEXT:    shrq %rdi
+; AVX2-NEXT:    andq %r9, %rdi
+; AVX2-NEXT:    leaq (%rdi,%r10,2), %rdi
+; AVX2-NEXT:    movq %rdi, 16(%rax)
+; AVX2-NEXT:    movq %r14, 8(%rax)
+; AVX2-NEXT:    movq %rbx, (%rax)
+; AVX2-NEXT:    movq %rsi, 56(%rax)
+; AVX2-NEXT:    movq %rdx, 48(%rax)
+; AVX2-NEXT:    movq %rcx, 40(%rax)
+; AVX2-NEXT:    movq %r8, 32(%rax)
+; AVX2-NEXT:    movq %r11, 24(%rax)
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i512:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %r12
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    bswapq %r9
+; AVX512F-NEXT:    movq %r9, %rdi
+; AVX512F-NEXT:    shrq $4, %rdi
+; AVX512F-NEXT:    movabsq $1085102592571150095, %r10 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %r10, %rdi
+; AVX512F-NEXT:    andq %r10, %r9
+; AVX512F-NEXT:    shlq $4, %r9
+; AVX512F-NEXT:    orq %rdi, %r9
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %r9, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq $2, %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    leaq (%r9,%r11,4), %r11
+; AVX512F-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r11, %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    shrq %r11
+; AVX512F-NEXT:    andq %r9, %r11
+; AVX512F-NEXT:    bswapq %r8
+; AVX512F-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX512F-NEXT:    movq %r8, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %r10, %rbx
+; AVX512F-NEXT:    andq %r10, %r8
+; AVX512F-NEXT:    shlq $4, %r8
+; AVX512F-NEXT:    orq %rbx, %r8
+; AVX512F-NEXT:    movq %r8, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq $2, %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    leaq (%r8,%rbx,4), %r8
+; AVX512F-NEXT:    movq %r8, %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    shrq %r8
+; AVX512F-NEXT:    andq %r9, %r8
+; AVX512F-NEXT:    leaq (%r8,%rbx,2), %r8
+; AVX512F-NEXT:    bswapq %rcx
+; AVX512F-NEXT:    movq %rcx, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %r10, %rbx
+; AVX512F-NEXT:    andq %r10, %rcx
+; AVX512F-NEXT:    shlq $4, %rcx
+; AVX512F-NEXT:    orq %rbx, %rcx
+; AVX512F-NEXT:    movq %rcx, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq $2, %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rbx,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %r9, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rbx,2), %rcx
+; AVX512F-NEXT:    bswapq %rdx
+; AVX512F-NEXT:    movq %rdx, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %r10, %rbx
+; AVX512F-NEXT:    andq %r10, %rdx
+; AVX512F-NEXT:    shlq $4, %rdx
+; AVX512F-NEXT:    orq %rbx, %rdx
+; AVX512F-NEXT:    movq %rdx, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq $2, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%rbx,4), %rdx
+; AVX512F-NEXT:    movq %rdx, %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    shrq %rdx
+; AVX512F-NEXT:    andq %r9, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; AVX512F-NEXT:    bswapq %rsi
+; AVX512F-NEXT:    movq %rsi, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %r10, %rbx
+; AVX512F-NEXT:    andq %r10, %rsi
+; AVX512F-NEXT:    shlq $4, %rsi
+; AVX512F-NEXT:    orq %rbx, %rsi
+; AVX512F-NEXT:    movq %rsi, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq $2, %rsi
+; AVX512F-NEXT:    andq %rdi, %rsi
+; AVX512F-NEXT:    leaq (%rsi,%rbx,4), %rsi
+; AVX512F-NEXT:    movq %rsi, %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    shrq %rsi
+; AVX512F-NEXT:    andq %r9, %rsi
+; AVX512F-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT:    leaq (%rsi,%rbx,2), %rsi
+; AVX512F-NEXT:    movq %r14, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %r10, %rbx
+; AVX512F-NEXT:    andq %r10, %r14
+; AVX512F-NEXT:    shlq $4, %r14
+; AVX512F-NEXT:    orq %rbx, %r14
+; AVX512F-NEXT:    movq %r14, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq $2, %r14
+; AVX512F-NEXT:    andq %rdi, %r14
+; AVX512F-NEXT:    leaq (%r14,%rbx,4), %rbx
+; AVX512F-NEXT:    movq %rbx, %r14
+; AVX512F-NEXT:    andq %r9, %r14
+; AVX512F-NEXT:    shrq %rbx
+; AVX512F-NEXT:    andq %r9, %rbx
+; AVX512F-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512F-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX512F-NEXT:    movq %r14, %r15
+; AVX512F-NEXT:    shrq $4, %r15
+; AVX512F-NEXT:    andq %r10, %r15
+; AVX512F-NEXT:    andq %r10, %r14
+; AVX512F-NEXT:    shlq $4, %r14
+; AVX512F-NEXT:    orq %r15, %r14
+; AVX512F-NEXT:    movq %r14, %r15
+; AVX512F-NEXT:    andq %rdi, %r15
+; AVX512F-NEXT:    shrq $2, %r14
+; AVX512F-NEXT:    andq %rdi, %r14
+; AVX512F-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512F-NEXT:    movq %r14, %r15
+; AVX512F-NEXT:    andq %r9, %r15
+; AVX512F-NEXT:    shrq %r14
+; AVX512F-NEXT:    andq %r9, %r14
+; AVX512F-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512F-NEXT:    movbeq {{[0-9]+}}(%rsp), %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %r10, %r12
+; AVX512F-NEXT:    andq %r10, %r15
+; AVX512F-NEXT:    shlq $4, %r15
+; AVX512F-NEXT:    orq %r12, %r15
+; AVX512F-NEXT:    movq %r15, %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    shrq $2, %r15
+; AVX512F-NEXT:    andq %rdi, %r15
+; AVX512F-NEXT:    leaq (%r15,%r10,4), %rdi
+; AVX512F-NEXT:    movq %rdi, %r10
+; AVX512F-NEXT:    andq %r9, %r10
+; AVX512F-NEXT:    shrq %rdi
+; AVX512F-NEXT:    andq %r9, %rdi
+; AVX512F-NEXT:    leaq (%rdi,%r10,2), %rdi
+; AVX512F-NEXT:    movq %rdi, 16(%rax)
+; AVX512F-NEXT:    movq %r14, 8(%rax)
+; AVX512F-NEXT:    movq %rbx, (%rax)
+; AVX512F-NEXT:    movq %rsi, 56(%rax)
+; AVX512F-NEXT:    movq %rdx, 48(%rax)
+; AVX512F-NEXT:    movq %rcx, 40(%rax)
+; AVX512F-NEXT:    movq %r8, 32(%rax)
+; AVX512F-NEXT:    movq %r11, 24(%rax)
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r12
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i512:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %r12
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    bswapq %r9
+; AVX512VL-NEXT:    movq %r9, %rdi
+; AVX512VL-NEXT:    shrq $4, %rdi
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %r11, %rdi
+; AVX512VL-NEXT:    andq %r11, %r9
+; AVX512VL-NEXT:    shlq $4, %r9
+; AVX512VL-NEXT:    orq %rdi, %r9
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %r9, %rdi
+; AVX512VL-NEXT:    andq %r10, %rdi
+; AVX512VL-NEXT:    shrq $2, %r9
+; AVX512VL-NEXT:    andq %r10, %r9
+; AVX512VL-NEXT:    leaq (%r9,%rdi,4), %rdi
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %rdi, %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    shrq %rdi
+; AVX512VL-NEXT:    andq %r9, %rdi
+; AVX512VL-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; AVX512VL-NEXT:    bswapq %r8
+; AVX512VL-NEXT:    movq %r8, %rbx
+; AVX512VL-NEXT:    shrq $4, %rbx
+; AVX512VL-NEXT:    andq %r11, %rbx
+; AVX512VL-NEXT:    andq %r11, %r8
+; AVX512VL-NEXT:    shlq $4, %r8
+; AVX512VL-NEXT:    orq %rbx, %r8
+; AVX512VL-NEXT:    movq %r8, %rbx
+; AVX512VL-NEXT:    andq %r10, %rbx
+; AVX512VL-NEXT:    shrq $2, %r8
+; AVX512VL-NEXT:    andq %r10, %r8
+; AVX512VL-NEXT:    leaq (%r8,%rbx,4), %r8
+; AVX512VL-NEXT:    movq %r8, %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    shrq %r8
+; AVX512VL-NEXT:    andq %r9, %r8
+; AVX512VL-NEXT:    leaq (%r8,%rbx,2), %r8
+; AVX512VL-NEXT:    bswapq %rcx
+; AVX512VL-NEXT:    movq %rcx, %rbx
+; AVX512VL-NEXT:    shrq $4, %rbx
+; AVX512VL-NEXT:    andq %r11, %rbx
+; AVX512VL-NEXT:    andq %r11, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %rbx, %rcx
+; AVX512VL-NEXT:    movq %rcx, %rbx
+; AVX512VL-NEXT:    andq %r10, %rbx
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %r10, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rbx,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %r9, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rbx,2), %rcx
+; AVX512VL-NEXT:    bswapq %rdx
+; AVX512VL-NEXT:    movq %rdx, %rbx
+; AVX512VL-NEXT:    shrq $4, %rbx
+; AVX512VL-NEXT:    andq %r11, %rbx
+; AVX512VL-NEXT:    andq %r11, %rdx
+; AVX512VL-NEXT:    shlq $4, %rdx
+; AVX512VL-NEXT:    orq %rbx, %rdx
+; AVX512VL-NEXT:    movq %rdx, %rbx
+; AVX512VL-NEXT:    andq %r10, %rbx
+; AVX512VL-NEXT:    shrq $2, %rdx
+; AVX512VL-NEXT:    andq %r10, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%rbx,4), %rdx
+; AVX512VL-NEXT:    movq %rdx, %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    shrq %rdx
+; AVX512VL-NEXT:    andq %r9, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; AVX512VL-NEXT:    bswapq %rsi
+; AVX512VL-NEXT:    movq %rsi, %rbx
+; AVX512VL-NEXT:    shrq $4, %rbx
+; AVX512VL-NEXT:    andq %r11, %rbx
+; AVX512VL-NEXT:    andq %r11, %rsi
+; AVX512VL-NEXT:    shlq $4, %rsi
+; AVX512VL-NEXT:    orq %rbx, %rsi
+; AVX512VL-NEXT:    movq %rsi, %rbx
+; AVX512VL-NEXT:    andq %r10, %rbx
+; AVX512VL-NEXT:    shrq $2, %rsi
+; AVX512VL-NEXT:    andq %r10, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%rbx,4), %rsi
+; AVX512VL-NEXT:    movq %rsi, %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    shrq %rsi
+; AVX512VL-NEXT:    andq %r9, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%rbx,2), %rsi
+; AVX512VL-NEXT:    movbeq {{[0-9]+}}(%rsp), %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    shrq $4, %r14
+; AVX512VL-NEXT:    andq %r11, %r14
+; AVX512VL-NEXT:    andq %r11, %rbx
+; AVX512VL-NEXT:    shlq $4, %rbx
+; AVX512VL-NEXT:    orq %r14, %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    andq %r10, %r14
+; AVX512VL-NEXT:    shrq $2, %rbx
+; AVX512VL-NEXT:    andq %r10, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    andq %r9, %r14
+; AVX512VL-NEXT:    shrq %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512VL-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    shrq $4, %r15
+; AVX512VL-NEXT:    andq %r11, %r15
+; AVX512VL-NEXT:    andq %r11, %r14
+; AVX512VL-NEXT:    shlq $4, %r14
+; AVX512VL-NEXT:    orq %r15, %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    andq %r10, %r15
+; AVX512VL-NEXT:    shrq $2, %r14
+; AVX512VL-NEXT:    andq %r10, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    andq %r9, %r15
+; AVX512VL-NEXT:    shrq %r14
+; AVX512VL-NEXT:    andq %r9, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512VL-NEXT:    movbeq {{[0-9]+}}(%rsp), %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r11, %r12
+; AVX512VL-NEXT:    andq %r11, %r15
+; AVX512VL-NEXT:    shlq $4, %r15
+; AVX512VL-NEXT:    orq %r12, %r15
+; AVX512VL-NEXT:    movq %r15, %r11
+; AVX512VL-NEXT:    andq %r10, %r11
+; AVX512VL-NEXT:    shrq $2, %r15
+; AVX512VL-NEXT:    andq %r10, %r15
+; AVX512VL-NEXT:    leaq (%r15,%r11,4), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %r9, %r11
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %r9, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,2), %r9
+; AVX512VL-NEXT:    movq %r9, 16(%rax)
+; AVX512VL-NEXT:    movq %r14, 8(%rax)
+; AVX512VL-NEXT:    movq %rbx, (%rax)
+; AVX512VL-NEXT:    movq %rsi, 56(%rax)
+; AVX512VL-NEXT:    movq %rdx, 48(%rax)
+; AVX512VL-NEXT:    movq %rcx, 40(%rax)
+; AVX512VL-NEXT:    movq %r8, 32(%rax)
+; AVX512VL-NEXT:    movq %rdi, 24(%rax)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r12
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i512:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %r12
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    bswapq %r9
+; AVX512VBMI-NEXT:    movq %r9, %rdi
+; AVX512VBMI-NEXT:    shrq $4, %rdi
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %r11 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %r11, %rdi
+; AVX512VBMI-NEXT:    andq %r11, %r9
+; AVX512VBMI-NEXT:    shlq $4, %r9
+; AVX512VBMI-NEXT:    orq %rdi, %r9
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %r10 # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %r9, %rdi
+; AVX512VBMI-NEXT:    andq %r10, %rdi
+; AVX512VBMI-NEXT:    shrq $2, %r9
+; AVX512VBMI-NEXT:    andq %r10, %r9
+; AVX512VBMI-NEXT:    leaq (%r9,%rdi,4), %rdi
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %rdi, %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    shrq %rdi
+; AVX512VBMI-NEXT:    andq %r9, %rdi
+; AVX512VBMI-NEXT:    leaq (%rdi,%rbx,2), %rdi
+; AVX512VBMI-NEXT:    bswapq %r8
+; AVX512VBMI-NEXT:    movq %r8, %rbx
+; AVX512VBMI-NEXT:    shrq $4, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %r8
+; AVX512VBMI-NEXT:    shlq $4, %r8
+; AVX512VBMI-NEXT:    orq %rbx, %r8
+; AVX512VBMI-NEXT:    movq %r8, %rbx
+; AVX512VBMI-NEXT:    andq %r10, %rbx
+; AVX512VBMI-NEXT:    shrq $2, %r8
+; AVX512VBMI-NEXT:    andq %r10, %r8
+; AVX512VBMI-NEXT:    leaq (%r8,%rbx,4), %r8
+; AVX512VBMI-NEXT:    movq %r8, %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    shrq %r8
+; AVX512VBMI-NEXT:    andq %r9, %r8
+; AVX512VBMI-NEXT:    leaq (%r8,%rbx,2), %r8
+; AVX512VBMI-NEXT:    bswapq %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rbx
+; AVX512VBMI-NEXT:    shrq $4, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %rbx, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rbx
+; AVX512VBMI-NEXT:    andq %r10, %rbx
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %r10, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rbx,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %r9, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rbx,2), %rcx
+; AVX512VBMI-NEXT:    bswapq %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %rbx
+; AVX512VBMI-NEXT:    shrq $4, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rdx
+; AVX512VBMI-NEXT:    shlq $4, %rdx
+; AVX512VBMI-NEXT:    orq %rbx, %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %rbx
+; AVX512VBMI-NEXT:    andq %r10, %rbx
+; AVX512VBMI-NEXT:    shrq $2, %rdx
+; AVX512VBMI-NEXT:    andq %r10, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%rbx,4), %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    shrq %rdx
+; AVX512VBMI-NEXT:    andq %r9, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%rbx,2), %rdx
+; AVX512VBMI-NEXT:    bswapq %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rbx
+; AVX512VBMI-NEXT:    shrq $4, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %r11, %rsi
+; AVX512VBMI-NEXT:    shlq $4, %rsi
+; AVX512VBMI-NEXT:    orq %rbx, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rbx
+; AVX512VBMI-NEXT:    andq %r10, %rbx
+; AVX512VBMI-NEXT:    shrq $2, %rsi
+; AVX512VBMI-NEXT:    andq %r10, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%rbx,4), %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    shrq %rsi
+; AVX512VBMI-NEXT:    andq %r9, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%rbx,2), %rsi
+; AVX512VBMI-NEXT:    movbeq {{[0-9]+}}(%rsp), %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    shrq $4, %r14
+; AVX512VBMI-NEXT:    andq %r11, %r14
+; AVX512VBMI-NEXT:    andq %r11, %rbx
+; AVX512VBMI-NEXT:    shlq $4, %rbx
+; AVX512VBMI-NEXT:    orq %r14, %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    andq %r10, %r14
+; AVX512VBMI-NEXT:    shrq $2, %rbx
+; AVX512VBMI-NEXT:    andq %r10, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    andq %r9, %r14
+; AVX512VBMI-NEXT:    shrq %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512VBMI-NEXT:    movbeq {{[0-9]+}}(%rsp), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    shrq $4, %r15
+; AVX512VBMI-NEXT:    andq %r11, %r15
+; AVX512VBMI-NEXT:    andq %r11, %r14
+; AVX512VBMI-NEXT:    shlq $4, %r14
+; AVX512VBMI-NEXT:    orq %r15, %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    andq %r10, %r15
+; AVX512VBMI-NEXT:    shrq $2, %r14
+; AVX512VBMI-NEXT:    andq %r10, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    andq %r9, %r15
+; AVX512VBMI-NEXT:    shrq %r14
+; AVX512VBMI-NEXT:    andq %r9, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512VBMI-NEXT:    movbeq {{[0-9]+}}(%rsp), %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r11, %r12
+; AVX512VBMI-NEXT:    andq %r11, %r15
+; AVX512VBMI-NEXT:    shlq $4, %r15
+; AVX512VBMI-NEXT:    orq %r12, %r15
+; AVX512VBMI-NEXT:    movq %r15, %r11
+; AVX512VBMI-NEXT:    andq %r10, %r11
+; AVX512VBMI-NEXT:    shrq $2, %r15
+; AVX512VBMI-NEXT:    andq %r10, %r15
+; AVX512VBMI-NEXT:    leaq (%r15,%r11,4), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %r9, %r11
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %r9, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,2), %r9
+; AVX512VBMI-NEXT:    movq %r9, 16(%rax)
+; AVX512VBMI-NEXT:    movq %r14, 8(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, (%rax)
+; AVX512VBMI-NEXT:    movq %rsi, 56(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 48(%rax)
+; AVX512VBMI-NEXT:    movq %rcx, 40(%rax)
+; AVX512VBMI-NEXT:    movq %r8, 32(%rax)
+; AVX512VBMI-NEXT:    movq %rdi, 24(%rax)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r12
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    retq
+  %res = call i512 @llvm.bitreverse.i512(i512 %a0)
+  ret i512 %res
+}
+
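+; Vector-register variant of the i512 bitreverse: limbs are extracted
+; first, then run through the same per-limb bswap + SWAR expansion.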
+define i512 @bitreverse_i512_vector(<8 x i64> %v0) nounwind {
+; SSE2-LABEL: bitreverse_i512_vector:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movq %xmm0, %rcx
+; SSE2-NEXT:    movq %xmm1, %r9
+; SSE2-NEXT:    movq %xmm2, %r11
+; SSE2-NEXT:    movq %xmm3, %r14
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r15
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rbx
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %r10
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,2,3]
+; SSE2-NEXT:    movq %xmm0, %rdx
+; SSE2-NEXT:    bswapq %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    shrq $4, %rsi
+; SSE2-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; SSE2-NEXT:    andq %rdi, %rsi
+; SSE2-NEXT:    andq %rdi, %rdx
+; SSE2-NEXT:    shlq $4, %rdx
+; SSE2-NEXT:    orq %rsi, %rdx
+; SSE2-NEXT:    movabsq $3689348814741910323, %rsi # imm = 0x3333333333333333
+; SSE2-NEXT:    movq %rdx, %r8
+; SSE2-NEXT:    andq %rsi, %r8
+; SSE2-NEXT:    shrq $2, %rdx
+; SSE2-NEXT:    andq %rsi, %rdx
+; SSE2-NEXT:    leaq (%rdx,%r8,4), %rdx
+; SSE2-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; SSE2-NEXT:    movq %rdx, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %rdx
+; SSE2-NEXT:    andq %r8, %rdx
+; SSE2-NEXT:    leaq (%rdx,%r12,2), %rdx
+; SSE2-NEXT:    bswapq %r10
+; SSE2-NEXT:    movq %r10, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %r10
+; SSE2-NEXT:    shlq $4, %r10
+; SSE2-NEXT:    orq %r12, %r10
+; SSE2-NEXT:    movq %r10, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %r10
+; SSE2-NEXT:    andq %rsi, %r10
+; SSE2-NEXT:    leaq (%r10,%r12,4), %r10
+; SSE2-NEXT:    movq %r10, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %r10
+; SSE2-NEXT:    andq %r8, %r10
+; SSE2-NEXT:    bswapq %rbx
+; SSE2-NEXT:    leaq (%r10,%r12,2), %r10
+; SSE2-NEXT:    movq %rbx, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %rbx
+; SSE2-NEXT:    shlq $4, %rbx
+; SSE2-NEXT:    orq %r12, %rbx
+; SSE2-NEXT:    movq %rbx, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %rbx
+; SSE2-NEXT:    andq %rsi, %rbx
+; SSE2-NEXT:    leaq (%rbx,%r12,4), %rbx
+; SSE2-NEXT:    movq %rbx, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %rbx
+; SSE2-NEXT:    andq %r8, %rbx
+; SSE2-NEXT:    leaq (%rbx,%r12,2), %rbx
+; SSE2-NEXT:    bswapq %r15
+; SSE2-NEXT:    movq %r15, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %r15
+; SSE2-NEXT:    shlq $4, %r15
+; SSE2-NEXT:    orq %r12, %r15
+; SSE2-NEXT:    movq %r15, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %r15
+; SSE2-NEXT:    andq %rsi, %r15
+; SSE2-NEXT:    leaq (%r15,%r12,4), %r15
+; SSE2-NEXT:    movq %r15, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %r15
+; SSE2-NEXT:    andq %r8, %r15
+; SSE2-NEXT:    bswapq %r14
+; SSE2-NEXT:    leaq (%r15,%r12,2), %r15
+; SSE2-NEXT:    movq %r14, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %r14
+; SSE2-NEXT:    shlq $4, %r14
+; SSE2-NEXT:    orq %r12, %r14
+; SSE2-NEXT:    movq %r14, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %r14
+; SSE2-NEXT:    andq %rsi, %r14
+; SSE2-NEXT:    leaq (%r14,%r12,4), %r14
+; SSE2-NEXT:    movq %r14, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %r14
+; SSE2-NEXT:    andq %r8, %r14
+; SSE2-NEXT:    leaq (%r14,%r12,2), %r14
+; SSE2-NEXT:    bswapq %r11
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %r11
+; SSE2-NEXT:    shlq $4, %r11
+; SSE2-NEXT:    orq %r12, %r11
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %r11
+; SSE2-NEXT:    andq %rsi, %r11
+; SSE2-NEXT:    leaq (%r11,%r12,4), %r11
+; SSE2-NEXT:    movq %r11, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %r11
+; SSE2-NEXT:    andq %r8, %r11
+; SSE2-NEXT:    bswapq %r9
+; SSE2-NEXT:    leaq (%r11,%r12,2), %r11
+; SSE2-NEXT:    movq %r9, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %r9
+; SSE2-NEXT:    shlq $4, %r9
+; SSE2-NEXT:    orq %r12, %r9
+; SSE2-NEXT:    movq %r9, %r12
+; SSE2-NEXT:    andq %rsi, %r12
+; SSE2-NEXT:    shrq $2, %r9
+; SSE2-NEXT:    andq %rsi, %r9
+; SSE2-NEXT:    leaq (%r9,%r12,4), %r9
+; SSE2-NEXT:    movq %r9, %r12
+; SSE2-NEXT:    andq %r8, %r12
+; SSE2-NEXT:    shrq %r9
+; SSE2-NEXT:    andq %r8, %r9
+; SSE2-NEXT:    leaq (%r9,%r12,2), %r9
+; SSE2-NEXT:    bswapq %rcx
+; SSE2-NEXT:    movq %rcx, %r12
+; SSE2-NEXT:    shrq $4, %r12
+; SSE2-NEXT:    andq %rdi, %r12
+; SSE2-NEXT:    andq %rdi, %rcx
+; SSE2-NEXT:    shlq $4, %rcx
+; SSE2-NEXT:    orq %r12, %rcx
+; SSE2-NEXT:    movq %rcx, %rdi
+; SSE2-NEXT:    andq %rsi, %rdi
+; SSE2-NEXT:    shrq $2, %rcx
+; SSE2-NEXT:    andq %rsi, %rcx
+; SSE2-NEXT:    leaq (%rcx,%rdi,4), %rcx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    andq %r8, %rsi
+; SSE2-NEXT:    shrq %rcx
+; SSE2-NEXT:    andq %r8, %rcx
+; SSE2-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; SSE2-NEXT:    movq %rcx, 56(%rax)
+; SSE2-NEXT:    movq %r9, 40(%rax)
+; SSE2-NEXT:    movq %r11, 24(%rax)
+; SSE2-NEXT:    movq %r14, 8(%rax)
+; SSE2-NEXT:    movq %r15, 48(%rax)
+; SSE2-NEXT:    movq %rbx, 32(%rax)
+; SSE2-NEXT:    movq %r10, 16(%rax)
+; SSE2-NEXT:    movq %rdx, (%rax)
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: bitreverse_i512_vector:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    pushq %r15
+; SSE42-NEXT:    pushq %r14
+; SSE42-NEXT:    pushq %r12
+; SSE42-NEXT:    pushq %rbx
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    movq %xmm0, %rcx
+; SSE42-NEXT:    pextrq $1, %xmm0, %r9
+; SSE42-NEXT:    movq %xmm1, %r11
+; SSE42-NEXT:    pextrq $1, %xmm1, %r14
+; SSE42-NEXT:    pextrq $1, %xmm2, %rbx
+; SSE42-NEXT:    movq %xmm2, %r15
+; SSE42-NEXT:    movq %xmm3, %r10
+; SSE42-NEXT:    pextrq $1, %xmm3, %rdx
+; SSE42-NEXT:    bswapq %rdx
+; SSE42-NEXT:    movq %rdx, %rsi
+; SSE42-NEXT:    shrq $4, %rsi
+; SSE42-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; SSE42-NEXT:    andq %rdi, %rsi
+; SSE42-NEXT:    andq %rdi, %rdx
+; SSE42-NEXT:    shlq $4, %rdx
+; SSE42-NEXT:    orq %rsi, %rdx
+; SSE42-NEXT:    movabsq $3689348814741910323, %rsi # imm = 0x3333333333333333
+; SSE42-NEXT:    movq %rdx, %r8
+; SSE42-NEXT:    andq %rsi, %r8
+; SSE42-NEXT:    shrq $2, %rdx
+; SSE42-NEXT:    andq %rsi, %rdx
+; SSE42-NEXT:    leaq (%rdx,%r8,4), %rdx
+; SSE42-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; SSE42-NEXT:    movq %rdx, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %rdx
+; SSE42-NEXT:    andq %r8, %rdx
+; SSE42-NEXT:    leaq (%rdx,%r12,2), %rdx
+; SSE42-NEXT:    bswapq %r10
+; SSE42-NEXT:    movq %r10, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %r10
+; SSE42-NEXT:    shlq $4, %r10
+; SSE42-NEXT:    orq %r12, %r10
+; SSE42-NEXT:    movq %r10, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %r10
+; SSE42-NEXT:    andq %rsi, %r10
+; SSE42-NEXT:    leaq (%r10,%r12,4), %r10
+; SSE42-NEXT:    movq %r10, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %r10
+; SSE42-NEXT:    andq %r8, %r10
+; SSE42-NEXT:    bswapq %rbx
+; SSE42-NEXT:    leaq (%r10,%r12,2), %r10
+; SSE42-NEXT:    movq %rbx, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %rbx
+; SSE42-NEXT:    shlq $4, %rbx
+; SSE42-NEXT:    orq %r12, %rbx
+; SSE42-NEXT:    movq %rbx, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %rbx
+; SSE42-NEXT:    andq %rsi, %rbx
+; SSE42-NEXT:    leaq (%rbx,%r12,4), %rbx
+; SSE42-NEXT:    movq %rbx, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %rbx
+; SSE42-NEXT:    andq %r8, %rbx
+; SSE42-NEXT:    leaq (%rbx,%r12,2), %rbx
+; SSE42-NEXT:    bswapq %r15
+; SSE42-NEXT:    movq %r15, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %r15
+; SSE42-NEXT:    shlq $4, %r15
+; SSE42-NEXT:    orq %r12, %r15
+; SSE42-NEXT:    movq %r15, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %r15
+; SSE42-NEXT:    andq %rsi, %r15
+; SSE42-NEXT:    leaq (%r15,%r12,4), %r15
+; SSE42-NEXT:    movq %r15, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %r15
+; SSE42-NEXT:    andq %r8, %r15
+; SSE42-NEXT:    bswapq %r14
+; SSE42-NEXT:    leaq (%r15,%r12,2), %r15
+; SSE42-NEXT:    movq %r14, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %r14
+; SSE42-NEXT:    shlq $4, %r14
+; SSE42-NEXT:    orq %r12, %r14
+; SSE42-NEXT:    movq %r14, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %r14
+; SSE42-NEXT:    andq %rsi, %r14
+; SSE42-NEXT:    leaq (%r14,%r12,4), %r14
+; SSE42-NEXT:    movq %r14, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %r14
+; SSE42-NEXT:    andq %r8, %r14
+; SSE42-NEXT:    leaq (%r14,%r12,2), %r14
+; SSE42-NEXT:    bswapq %r11
+; SSE42-NEXT:    movq %r11, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %r11
+; SSE42-NEXT:    shlq $4, %r11
+; SSE42-NEXT:    orq %r12, %r11
+; SSE42-NEXT:    movq %r11, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %r11
+; SSE42-NEXT:    andq %rsi, %r11
+; SSE42-NEXT:    leaq (%r11,%r12,4), %r11
+; SSE42-NEXT:    movq %r11, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %r11
+; SSE42-NEXT:    andq %r8, %r11
+; SSE42-NEXT:    bswapq %r9
+; SSE42-NEXT:    leaq (%r11,%r12,2), %r11
+; SSE42-NEXT:    movq %r9, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %r9
+; SSE42-NEXT:    shlq $4, %r9
+; SSE42-NEXT:    orq %r12, %r9
+; SSE42-NEXT:    movq %r9, %r12
+; SSE42-NEXT:    andq %rsi, %r12
+; SSE42-NEXT:    shrq $2, %r9
+; SSE42-NEXT:    andq %rsi, %r9
+; SSE42-NEXT:    leaq (%r9,%r12,4), %r9
+; SSE42-NEXT:    movq %r9, %r12
+; SSE42-NEXT:    andq %r8, %r12
+; SSE42-NEXT:    shrq %r9
+; SSE42-NEXT:    andq %r8, %r9
+; SSE42-NEXT:    leaq (%r9,%r12,2), %r9
+; SSE42-NEXT:    bswapq %rcx
+; SSE42-NEXT:    movq %rcx, %r12
+; SSE42-NEXT:    shrq $4, %r12
+; SSE42-NEXT:    andq %rdi, %r12
+; SSE42-NEXT:    andq %rdi, %rcx
+; SSE42-NEXT:    shlq $4, %rcx
+; SSE42-NEXT:    orq %r12, %rcx
+; SSE42-NEXT:    movq %rcx, %rdi
+; SSE42-NEXT:    andq %rsi, %rdi
+; SSE42-NEXT:    shrq $2, %rcx
+; SSE42-NEXT:    andq %rsi, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rdi,4), %rcx
+; SSE42-NEXT:    movq %rcx, %rsi
+; SSE42-NEXT:    andq %r8, %rsi
+; SSE42-NEXT:    shrq %rcx
+; SSE42-NEXT:    andq %r8, %rcx
+; SSE42-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; SSE42-NEXT:    movq %rcx, 56(%rax)
+; SSE42-NEXT:    movq %r9, 48(%rax)
+; SSE42-NEXT:    movq %r11, 40(%rax)
+; SSE42-NEXT:    movq %r14, 32(%rax)
+; SSE42-NEXT:    movq %r15, 24(%rax)
+; SSE42-NEXT:    movq %rbx, 16(%rax)
+; SSE42-NEXT:    movq %r10, 8(%rax)
+; SSE42-NEXT:    movq %rdx, (%rax)
+; SSE42-NEXT:    popq %rbx
+; SSE42-NEXT:    popq %r12
+; SSE42-NEXT:    popq %r14
+; SSE42-NEXT:    popq %r15
+; SSE42-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i512_vector:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    vmovq %xmm0, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX2-NEXT:    vmovq %xmm1, %r15
+; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX2-NEXT:    vmovq %xmm0, %r10
+; AVX2-NEXT:    bswapq %r8
+; AVX2-NEXT:    movq %r8, %rdx
+; AVX2-NEXT:    shrq $4, %rdx
+; AVX2-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    shlq $4, %r8
+; AVX2-NEXT:    orq %rdx, %r8
+; AVX2-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %r8, %r9
+; AVX2-NEXT:    andq %rdx, %r9
+; AVX2-NEXT:    shrq $2, %r8
+; AVX2-NEXT:    andq %rdx, %r8
+; AVX2-NEXT:    leaq (%r8,%r9,4), %r9
+; AVX2-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r9, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %r9
+; AVX2-NEXT:    andq %r8, %r9
+; AVX2-NEXT:    bswapq %r10
+; AVX2-NEXT:    leaq (%r9,%r12,2), %r9
+; AVX2-NEXT:    movq %r10, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r12, %r10
+; AVX2-NEXT:    movq %r10, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    leaq (%r10,%r12,4), %r10
+; AVX2-NEXT:    movq %r10, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %r10
+; AVX2-NEXT:    andq %r8, %r10
+; AVX2-NEXT:    leaq (%r10,%r12,2), %r10
+; AVX2-NEXT:    bswapq %rbx
+; AVX2-NEXT:    movq %rbx, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shlq $4, %rbx
+; AVX2-NEXT:    orq %r12, %rbx
+; AVX2-NEXT:    movq %rbx, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %rbx
+; AVX2-NEXT:    andq %rdx, %rbx
+; AVX2-NEXT:    leaq (%rbx,%r12,4), %rbx
+; AVX2-NEXT:    movq %rbx, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %rbx
+; AVX2-NEXT:    andq %r8, %rbx
+; AVX2-NEXT:    leaq (%rbx,%r12,2), %rbx
+; AVX2-NEXT:    bswapq %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %r15
+; AVX2-NEXT:    shlq $4, %r15
+; AVX2-NEXT:    orq %r12, %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %r15
+; AVX2-NEXT:    andq %rdx, %r15
+; AVX2-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %r15
+; AVX2-NEXT:    andq %r8, %r15
+; AVX2-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX2-NEXT:    bswapq %r14
+; AVX2-NEXT:    movq %r14, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %r14
+; AVX2-NEXT:    shlq $4, %r14
+; AVX2-NEXT:    orq %r12, %r14
+; AVX2-NEXT:    movq %r14, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %r14
+; AVX2-NEXT:    andq %rdx, %r14
+; AVX2-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX2-NEXT:    movq %r14, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %r14
+; AVX2-NEXT:    andq %r8, %r14
+; AVX2-NEXT:    bswapq %r11
+; AVX2-NEXT:    leaq (%r14,%r12,2), %r14
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shlq $4, %r11
+; AVX2-NEXT:    orq %r12, %r11
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    leaq (%r11,%r12,4), %r11
+; AVX2-NEXT:    movq %r11, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %r11
+; AVX2-NEXT:    andq %r8, %r11
+; AVX2-NEXT:    leaq (%r11,%r12,2), %r11
+; AVX2-NEXT:    bswapq %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %rsi
+; AVX2-NEXT:    shlq $4, %rsi
+; AVX2-NEXT:    orq %r12, %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    shrq $2, %rsi
+; AVX2-NEXT:    andq %rdx, %rsi
+; AVX2-NEXT:    leaq (%rsi,%r12,4), %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    andq %r8, %r12
+; AVX2-NEXT:    shrq %rsi
+; AVX2-NEXT:    andq %r8, %rsi
+; AVX2-NEXT:    leaq (%rsi,%r12,2), %rsi
+; AVX2-NEXT:    bswapq %rcx
+; AVX2-NEXT:    movq %rcx, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    shlq $4, %rcx
+; AVX2-NEXT:    orq %r12, %rcx
+; AVX2-NEXT:    movq %rcx, %rdi
+; AVX2-NEXT:    andq %rdx, %rdi
+; AVX2-NEXT:    shrq $2, %rcx
+; AVX2-NEXT:    andq %rdx, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdi,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    andq %r8, %rdx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %r8, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rax)
+; AVX2-NEXT:    movq %rsi, 48(%rax)
+; AVX2-NEXT:    movq %r11, 40(%rax)
+; AVX2-NEXT:    movq %r14, 32(%rax)
+; AVX2-NEXT:    movq %r15, 24(%rax)
+; AVX2-NEXT:    movq %rbx, 16(%rax)
+; AVX2-NEXT:    movq %r10, 8(%rax)
+; AVX2-NEXT:    movq %r9, (%rax)
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i512_vector:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %r12
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512F-NEXT:    vmovq %xmm0, %rcx
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512F-NEXT:    vmovq %xmm1, %r11
+; AVX512F-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512F-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX512F-NEXT:    vmovq %xmm1, %r15
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vpextrq $1, %xmm0, %r8
+; AVX512F-NEXT:    vmovq %xmm0, %r10
+; AVX512F-NEXT:    bswapq %r8
+; AVX512F-NEXT:    movq %r8, %rsi
+; AVX512F-NEXT:    shrq $4, %rsi
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rdi, %rsi
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    shlq $4, %r8
+; AVX512F-NEXT:    orq %rsi, %r8
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rsi # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %r8, %r9
+; AVX512F-NEXT:    andq %rsi, %r9
+; AVX512F-NEXT:    shrq $2, %r8
+; AVX512F-NEXT:    andq %rsi, %r8
+; AVX512F-NEXT:    leaq (%r8,%r9,4), %r9
+; AVX512F-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r9, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %r9
+; AVX512F-NEXT:    andq %r8, %r9
+; AVX512F-NEXT:    bswapq %r10
+; AVX512F-NEXT:    leaq (%r9,%r12,2), %r9
+; AVX512F-NEXT:    movq %r10, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r12, %r10
+; AVX512F-NEXT:    movq %r10, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rsi, %r10
+; AVX512F-NEXT:    leaq (%r10,%r12,4), %r10
+; AVX512F-NEXT:    movq %r10, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %r10
+; AVX512F-NEXT:    andq %r8, %r10
+; AVX512F-NEXT:    leaq (%r10,%r12,2), %r10
+; AVX512F-NEXT:    bswapq %rbx
+; AVX512F-NEXT:    movq %rbx, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shlq $4, %rbx
+; AVX512F-NEXT:    orq %r12, %rbx
+; AVX512F-NEXT:    movq %rbx, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %rbx
+; AVX512F-NEXT:    andq %rsi, %rbx
+; AVX512F-NEXT:    leaq (%rbx,%r12,4), %rbx
+; AVX512F-NEXT:    movq %rbx, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %rbx
+; AVX512F-NEXT:    andq %r8, %rbx
+; AVX512F-NEXT:    leaq (%rbx,%r12,2), %rbx
+; AVX512F-NEXT:    bswapq %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %r15
+; AVX512F-NEXT:    shlq $4, %r15
+; AVX512F-NEXT:    orq %r12, %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %r15
+; AVX512F-NEXT:    andq %rsi, %r15
+; AVX512F-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %r15
+; AVX512F-NEXT:    andq %r8, %r15
+; AVX512F-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512F-NEXT:    bswapq %r14
+; AVX512F-NEXT:    movq %r14, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %r14
+; AVX512F-NEXT:    shlq $4, %r14
+; AVX512F-NEXT:    orq %r12, %r14
+; AVX512F-NEXT:    movq %r14, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %r14
+; AVX512F-NEXT:    andq %rsi, %r14
+; AVX512F-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512F-NEXT:    movq %r14, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %r14
+; AVX512F-NEXT:    andq %r8, %r14
+; AVX512F-NEXT:    bswapq %r11
+; AVX512F-NEXT:    leaq (%r14,%r12,2), %r14
+; AVX512F-NEXT:    movq %r11, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shlq $4, %r11
+; AVX512F-NEXT:    orq %r12, %r11
+; AVX512F-NEXT:    movq %r11, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %r11
+; AVX512F-NEXT:    andq %rsi, %r11
+; AVX512F-NEXT:    leaq (%r11,%r12,4), %r11
+; AVX512F-NEXT:    movq %r11, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %r11
+; AVX512F-NEXT:    andq %r8, %r11
+; AVX512F-NEXT:    leaq (%r11,%r12,2), %r11
+; AVX512F-NEXT:    bswapq %rdx
+; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    shlq $4, %rdx
+; AVX512F-NEXT:    orq %r12, %rdx
+; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    andq %rsi, %r12
+; AVX512F-NEXT:    shrq $2, %rdx
+; AVX512F-NEXT:    andq %rsi, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%r12,4), %rdx
+; AVX512F-NEXT:    movq %rdx, %r12
+; AVX512F-NEXT:    andq %r8, %r12
+; AVX512F-NEXT:    shrq %rdx
+; AVX512F-NEXT:    andq %r8, %rdx
+; AVX512F-NEXT:    leaq (%rdx,%r12,2), %rdx
+; AVX512F-NEXT:    bswapq %rcx
+; AVX512F-NEXT:    movq %rcx, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    shlq $4, %rcx
+; AVX512F-NEXT:    orq %r12, %rcx
+; AVX512F-NEXT:    movq %rcx, %rdi
+; AVX512F-NEXT:    andq %rsi, %rdi
+; AVX512F-NEXT:    shrq $2, %rcx
+; AVX512F-NEXT:    andq %rsi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rdi,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rsi
+; AVX512F-NEXT:    andq %r8, %rsi
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %r8, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rax)
+; AVX512F-NEXT:    movq %rdx, 48(%rax)
+; AVX512F-NEXT:    movq %r11, 24(%rax)
+; AVX512F-NEXT:    movq %r14, 16(%rax)
+; AVX512F-NEXT:    movq %r15, 8(%rax)
+; AVX512F-NEXT:    movq %rbx, (%rax)
+; AVX512F-NEXT:    movq %r10, 40(%rax)
+; AVX512F-NEXT:    movq %r9, 32(%rax)
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r12
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i512_vector:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %r12
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VL-NEXT:    vmovq %xmm0, %rcx
+; AVX512VL-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm1, %r11
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512VL-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VL-NEXT:    vmovq %xmm1, %r15
+; AVX512VL-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VL-NEXT:    vmovq %xmm0, %r10
+; AVX512VL-NEXT:    bswapq %rdx
+; AVX512VL-NEXT:    movq %rdx, %rdi
+; AVX512VL-NEXT:    shrq $4, %rdi
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %r8, %rdi
+; AVX512VL-NEXT:    andq %r8, %rdx
+; AVX512VL-NEXT:    shlq $4, %rdx
+; AVX512VL-NEXT:    orq %rdi, %rdx
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %rdx, %r9
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    shrq $2, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%r9,4), %rdx
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %rdx, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %rdx
+; AVX512VL-NEXT:    andq %r9, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%r12,2), %rdx
+; AVX512VL-NEXT:    bswapq %r10
+; AVX512VL-NEXT:    movq %r10, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %r10
+; AVX512VL-NEXT:    shlq $4, %r10
+; AVX512VL-NEXT:    orq %r12, %r10
+; AVX512VL-NEXT:    movq %r10, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r12,4), %r10
+; AVX512VL-NEXT:    movq %r10, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %r9, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r12,2), %r10
+; AVX512VL-NEXT:    bswapq %rbx
+; AVX512VL-NEXT:    movq %rbx, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %rbx
+; AVX512VL-NEXT:    shlq $4, %rbx
+; AVX512VL-NEXT:    orq %r12, %rbx
+; AVX512VL-NEXT:    movq %rbx, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %rbx
+; AVX512VL-NEXT:    andq %rdi, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r12,4), %rbx
+; AVX512VL-NEXT:    movq %rbx, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %rbx
+; AVX512VL-NEXT:    andq %r9, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r12,2), %rbx
+; AVX512VL-NEXT:    bswapq %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %r15
+; AVX512VL-NEXT:    shlq $4, %r15
+; AVX512VL-NEXT:    orq %r12, %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %r15
+; AVX512VL-NEXT:    andq %rdi, %r15
+; AVX512VL-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %r15
+; AVX512VL-NEXT:    andq %r9, %r15
+; AVX512VL-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512VL-NEXT:    bswapq %r14
+; AVX512VL-NEXT:    movq %r14, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %r14
+; AVX512VL-NEXT:    shlq $4, %r14
+; AVX512VL-NEXT:    orq %r12, %r14
+; AVX512VL-NEXT:    movq %r14, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %r14
+; AVX512VL-NEXT:    andq %rdi, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512VL-NEXT:    movq %r14, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %r14
+; AVX512VL-NEXT:    andq %r9, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r12,2), %r14
+; AVX512VL-NEXT:    bswapq %r11
+; AVX512VL-NEXT:    movq %r11, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    shlq $4, %r11
+; AVX512VL-NEXT:    orq %r12, %r11
+; AVX512VL-NEXT:    movq %r11, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    leaq (%r11,%r12,4), %r11
+; AVX512VL-NEXT:    movq %r11, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %r11
+; AVX512VL-NEXT:    andq %r9, %r11
+; AVX512VL-NEXT:    leaq (%r11,%r12,2), %r11
+; AVX512VL-NEXT:    bswapq %rsi
+; AVX512VL-NEXT:    movq %rsi, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %rsi
+; AVX512VL-NEXT:    shlq $4, %rsi
+; AVX512VL-NEXT:    orq %r12, %rsi
+; AVX512VL-NEXT:    movq %rsi, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    shrq $2, %rsi
+; AVX512VL-NEXT:    andq %rdi, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%r12,4), %rsi
+; AVX512VL-NEXT:    movq %rsi, %r12
+; AVX512VL-NEXT:    andq %r9, %r12
+; AVX512VL-NEXT:    shrq %rsi
+; AVX512VL-NEXT:    andq %r9, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%r12,2), %rsi
+; AVX512VL-NEXT:    bswapq %rcx
+; AVX512VL-NEXT:    movq %rcx, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    andq %r8, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %r12, %rcx
+; AVX512VL-NEXT:    movq %rcx, %r8
+; AVX512VL-NEXT:    andq %rdi, %r8
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%r8,4), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rdi
+; AVX512VL-NEXT:    andq %r9, %rdi
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %r9, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%rdi,2), %rcx
+; AVX512VL-NEXT:    movq %rcx, 56(%rax)
+; AVX512VL-NEXT:    movq %rsi, 48(%rax)
+; AVX512VL-NEXT:    movq %r11, 24(%rax)
+; AVX512VL-NEXT:    movq %r14, 16(%rax)
+; AVX512VL-NEXT:    movq %r15, 8(%rax)
+; AVX512VL-NEXT:    movq %rbx, (%rax)
+; AVX512VL-NEXT:    movq %r10, 40(%rax)
+; AVX512VL-NEXT:    movq %rdx, 32(%rax)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r12
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i512_vector:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %r12
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rsi
+; AVX512VBMI-NEXT:    vmovq %xmm0, %rcx
+; AVX512VBMI-NEXT:    vextracti32x4 $2, %zmm0, %xmm1
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r11
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %r14
+; AVX512VBMI-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
+; AVX512VBMI-NEXT:    vmovq %xmm1, %r15
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm1, %rbx
+; AVX512VBMI-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VBMI-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512VBMI-NEXT:    vmovq %xmm0, %r10
+; AVX512VBMI-NEXT:    bswapq %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %rdi
+; AVX512VBMI-NEXT:    shrq $4, %rdi
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %r8 # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %r8, %rdi
+; AVX512VBMI-NEXT:    andq %r8, %rdx
+; AVX512VBMI-NEXT:    shlq $4, %rdx
+; AVX512VBMI-NEXT:    orq %rdi, %rdx
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rdi # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %rdx, %r9
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    shrq $2, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%r9,4), %rdx
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %r9 # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %rdx, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %rdx
+; AVX512VBMI-NEXT:    andq %r9, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%r12,2), %rdx
+; AVX512VBMI-NEXT:    bswapq %r10
+; AVX512VBMI-NEXT:    movq %r10, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r10
+; AVX512VBMI-NEXT:    shlq $4, %r10
+; AVX512VBMI-NEXT:    orq %r12, %r10
+; AVX512VBMI-NEXT:    movq %r10, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r12,4), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %r9, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r12,2), %r10
+; AVX512VBMI-NEXT:    bswapq %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %rbx
+; AVX512VBMI-NEXT:    shlq $4, %rbx
+; AVX512VBMI-NEXT:    orq %r12, %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %rbx
+; AVX512VBMI-NEXT:    andq %rdi, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r12,4), %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %rbx
+; AVX512VBMI-NEXT:    andq %r9, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r12,2), %rbx
+; AVX512VBMI-NEXT:    bswapq %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r15
+; AVX512VBMI-NEXT:    shlq $4, %r15
+; AVX512VBMI-NEXT:    orq %r12, %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %r15
+; AVX512VBMI-NEXT:    andq %rdi, %r15
+; AVX512VBMI-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %r15
+; AVX512VBMI-NEXT:    andq %r9, %r15
+; AVX512VBMI-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512VBMI-NEXT:    bswapq %r14
+; AVX512VBMI-NEXT:    movq %r14, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r14
+; AVX512VBMI-NEXT:    shlq $4, %r14
+; AVX512VBMI-NEXT:    orq %r12, %r14
+; AVX512VBMI-NEXT:    movq %r14, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %r14
+; AVX512VBMI-NEXT:    andq %rdi, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r12,4), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %r14
+; AVX512VBMI-NEXT:    andq %r9, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r12,2), %r14
+; AVX512VBMI-NEXT:    bswapq %r11
+; AVX512VBMI-NEXT:    movq %r11, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    shlq $4, %r11
+; AVX512VBMI-NEXT:    orq %r12, %r11
+; AVX512VBMI-NEXT:    movq %r11, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    leaq (%r11,%r12,4), %r11
+; AVX512VBMI-NEXT:    movq %r11, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %r11
+; AVX512VBMI-NEXT:    andq %r9, %r11
+; AVX512VBMI-NEXT:    leaq (%r11,%r12,2), %r11
+; AVX512VBMI-NEXT:    bswapq %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %rsi
+; AVX512VBMI-NEXT:    shlq $4, %rsi
+; AVX512VBMI-NEXT:    orq %r12, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    shrq $2, %rsi
+; AVX512VBMI-NEXT:    andq %rdi, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%r12,4), %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %r12
+; AVX512VBMI-NEXT:    andq %r9, %r12
+; AVX512VBMI-NEXT:    shrq %rsi
+; AVX512VBMI-NEXT:    andq %r9, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%r12,2), %rsi
+; AVX512VBMI-NEXT:    bswapq %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    andq %r8, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %r12, %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %r8
+; AVX512VBMI-NEXT:    andq %rdi, %r8
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%r8,4), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rdi
+; AVX512VBMI-NEXT:    andq %r9, %rdi
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %r9, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%rdi,2), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, 56(%rax)
+; AVX512VBMI-NEXT:    movq %rsi, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r11, 24(%rax)
+; AVX512VBMI-NEXT:    movq %r14, 16(%rax)
+; AVX512VBMI-NEXT:    movq %r15, 8(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, (%rax)
+; AVX512VBMI-NEXT:    movq %r10, 40(%rax)
+; AVX512VBMI-NEXT:    movq %rdx, 32(%rax)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r12
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
+  %a0 = bitcast <8 x i64> %v0 to i512
+  %res = call i512 @llvm.bitreverse.i512(i512 %a0)
+  ret i512 %res
+}
+
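+; A hypothetical i128 reference for the wide reversal structure: each 64-bit
+; limb is bit-reversed independently and the limbs themselves swap ends,
+; which is why the stores above write the reversed limbs at mirrored offsets.
+define i128 @bitreverse_i128_ref(i128 %x) nounwind {
+  %lo  = trunc i128 %x to i64
+  %hiw = lshr i128 %x, 64
+  %hi  = trunc i128 %hiw to i64
+  %rlo = call i64 @llvm.bitreverse.i64(i64 %lo)
+  %rhi = call i64 @llvm.bitreverse.i64(i64 %hi)
+  %zlo = zext i64 %rlo to i128
+  %zhi = zext i64 %rhi to i128
+  %shl = shl i128 %zlo, 64
+  %res = or i128 %shl, %zhi
+  ret i128 %res
+}
+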
+define i512 @bitreverse_i512_load(ptr %p0) nounwind {
+; SSE-LABEL: bitreverse_i512_load:
+; SSE:       # %bb.0:
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    movq %rdi, %rax
+; SSE-NEXT:    movq (%rsi), %rcx
+; SSE-NEXT:    movq 8(%rsi), %r9
+; SSE-NEXT:    movq 16(%rsi), %r11
+; SSE-NEXT:    movq 24(%rsi), %r14
+; SSE-NEXT:    movq 32(%rsi), %r15
+; SSE-NEXT:    movq 40(%rsi), %rbx
+; SSE-NEXT:    movq 48(%rsi), %r10
+; SSE-NEXT:    movq 56(%rsi), %rdx
+; SSE-NEXT:    bswapq %rdx
+; SSE-NEXT:    movq %rdx, %rsi
+; SSE-NEXT:    shrq $4, %rsi
+; SSE-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; SSE-NEXT:    andq %rdi, %rsi
+; SSE-NEXT:    andq %rdi, %rdx
+; SSE-NEXT:    shlq $4, %rdx
+; SSE-NEXT:    orq %rsi, %rdx
+; SSE-NEXT:    movabsq $3689348814741910323, %rsi # imm = 0x3333333333333333
+; SSE-NEXT:    movq %rdx, %r8
+; SSE-NEXT:    andq %rsi, %r8
+; SSE-NEXT:    shrq $2, %rdx
+; SSE-NEXT:    andq %rsi, %rdx
+; SSE-NEXT:    leaq (%rdx,%r8,4), %rdx
+; SSE-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; SSE-NEXT:    movq %rdx, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %rdx
+; SSE-NEXT:    andq %r8, %rdx
+; SSE-NEXT:    leaq (%rdx,%r12,2), %rdx
+; SSE-NEXT:    bswapq %r10
+; SSE-NEXT:    movq %r10, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %r10
+; SSE-NEXT:    shlq $4, %r10
+; SSE-NEXT:    orq %r12, %r10
+; SSE-NEXT:    movq %r10, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %r10
+; SSE-NEXT:    andq %rsi, %r10
+; SSE-NEXT:    leaq (%r10,%r12,4), %r10
+; SSE-NEXT:    movq %r10, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %r10
+; SSE-NEXT:    andq %r8, %r10
+; SSE-NEXT:    bswapq %rbx
+; SSE-NEXT:    leaq (%r10,%r12,2), %r10
+; SSE-NEXT:    movq %rbx, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %rbx
+; SSE-NEXT:    shlq $4, %rbx
+; SSE-NEXT:    orq %r12, %rbx
+; SSE-NEXT:    movq %rbx, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %rbx
+; SSE-NEXT:    andq %rsi, %rbx
+; SSE-NEXT:    leaq (%rbx,%r12,4), %rbx
+; SSE-NEXT:    movq %rbx, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %rbx
+; SSE-NEXT:    andq %r8, %rbx
+; SSE-NEXT:    leaq (%rbx,%r12,2), %rbx
+; SSE-NEXT:    bswapq %r15
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %r15
+; SSE-NEXT:    shlq $4, %r15
+; SSE-NEXT:    orq %r12, %r15
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %r15
+; SSE-NEXT:    andq %rsi, %r15
+; SSE-NEXT:    leaq (%r15,%r12,4), %r15
+; SSE-NEXT:    movq %r15, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %r15
+; SSE-NEXT:    andq %r8, %r15
+; SSE-NEXT:    bswapq %r14
+; SSE-NEXT:    leaq (%r15,%r12,2), %r15
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %r14
+; SSE-NEXT:    shlq $4, %r14
+; SSE-NEXT:    orq %r12, %r14
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %r14
+; SSE-NEXT:    andq %rsi, %r14
+; SSE-NEXT:    leaq (%r14,%r12,4), %r14
+; SSE-NEXT:    movq %r14, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %r14
+; SSE-NEXT:    andq %r8, %r14
+; SSE-NEXT:    leaq (%r14,%r12,2), %r14
+; SSE-NEXT:    bswapq %r11
+; SSE-NEXT:    movq %r11, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %r11
+; SSE-NEXT:    shlq $4, %r11
+; SSE-NEXT:    orq %r12, %r11
+; SSE-NEXT:    movq %r11, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %r11
+; SSE-NEXT:    andq %rsi, %r11
+; SSE-NEXT:    leaq (%r11,%r12,4), %r11
+; SSE-NEXT:    movq %r11, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %r11
+; SSE-NEXT:    andq %r8, %r11
+; SSE-NEXT:    bswapq %r9
+; SSE-NEXT:    leaq (%r11,%r12,2), %r11
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %r9
+; SSE-NEXT:    shlq $4, %r9
+; SSE-NEXT:    orq %r12, %r9
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    andq %rsi, %r12
+; SSE-NEXT:    shrq $2, %r9
+; SSE-NEXT:    andq %rsi, %r9
+; SSE-NEXT:    leaq (%r9,%r12,4), %r9
+; SSE-NEXT:    movq %r9, %r12
+; SSE-NEXT:    andq %r8, %r12
+; SSE-NEXT:    shrq %r9
+; SSE-NEXT:    andq %r8, %r9
+; SSE-NEXT:    leaq (%r9,%r12,2), %r9
+; SSE-NEXT:    bswapq %rcx
+; SSE-NEXT:    movq %rcx, %r12
+; SSE-NEXT:    shrq $4, %r12
+; SSE-NEXT:    andq %rdi, %r12
+; SSE-NEXT:    andq %rdi, %rcx
+; SSE-NEXT:    shlq $4, %rcx
+; SSE-NEXT:    orq %r12, %rcx
+; SSE-NEXT:    movq %rcx, %rdi
+; SSE-NEXT:    andq %rsi, %rdi
+; SSE-NEXT:    shrq $2, %rcx
+; SSE-NEXT:    andq %rsi, %rcx
+; SSE-NEXT:    leaq (%rcx,%rdi,4), %rcx
+; SSE-NEXT:    movq %rcx, %rsi
+; SSE-NEXT:    andq %r8, %rsi
+; SSE-NEXT:    shrq %rcx
+; SSE-NEXT:    andq %r8, %rcx
+; SSE-NEXT:    leaq (%rcx,%rsi,2), %rcx
+; SSE-NEXT:    movq %rcx, 56(%rax)
+; SSE-NEXT:    movq %r9, 48(%rax)
+; SSE-NEXT:    movq %r11, 40(%rax)
+; SSE-NEXT:    movq %r14, 32(%rax)
+; SSE-NEXT:    movq %r15, 24(%rax)
+; SSE-NEXT:    movq %rbx, 16(%rax)
+; SSE-NEXT:    movq %r10, 8(%rax)
+; SSE-NEXT:    movq %rdx, (%rax)
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    retq
+;
+; AVX2-LABEL: bitreverse_i512_load:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movbeq 56(%rsi), %rdi
+; AVX2-NEXT:    movq %rdi, %rcx
+; AVX2-NEXT:    shrq $4, %rcx
+; AVX2-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX2-NEXT:    andq %rdx, %rcx
+; AVX2-NEXT:    andq %rdx, %rdi
+; AVX2-NEXT:    shlq $4, %rdi
+; AVX2-NEXT:    orq %rcx, %rdi
+; AVX2-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX2-NEXT:    movq %rdi, %r8
+; AVX2-NEXT:    andq %rcx, %r8
+; AVX2-NEXT:    shrq $2, %rdi
+; AVX2-NEXT:    andq %rcx, %rdi
+; AVX2-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX2-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX2-NEXT:    movq %r8, %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    shrq %r8
+; AVX2-NEXT:    andq %rdi, %r8
+; AVX2-NEXT:    movbeq 48(%rsi), %r10
+; AVX2-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    shrq $4, %r9
+; AVX2-NEXT:    andq %rdx, %r9
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r9, %r10
+; AVX2-NEXT:    movq %r10, %r9
+; AVX2-NEXT:    andq %rcx, %r9
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rcx, %r10
+; AVX2-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    shrq %r9
+; AVX2-NEXT:    andq %rdi, %r9
+; AVX2-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX2-NEXT:    movbeq 40(%rsi), %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    shrq $4, %r11
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    andq %rdx, %r10
+; AVX2-NEXT:    shlq $4, %r10
+; AVX2-NEXT:    orq %r11, %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rcx, %r11
+; AVX2-NEXT:    shrq $2, %r10
+; AVX2-NEXT:    andq %rcx, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX2-NEXT:    movq %r10, %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    shrq %r10
+; AVX2-NEXT:    andq %rdi, %r10
+; AVX2-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX2-NEXT:    movbeq 32(%rsi), %r11
+; AVX2-NEXT:    movq %r11, %rbx
+; AVX2-NEXT:    shrq $4, %rbx
+; AVX2-NEXT:    andq %rdx, %rbx
+; AVX2-NEXT:    andq %rdx, %r11
+; AVX2-NEXT:    shlq $4, %r11
+; AVX2-NEXT:    orq %rbx, %r11
+; AVX2-NEXT:    movq %r11, %rbx
+; AVX2-NEXT:    andq %rcx, %rbx
+; AVX2-NEXT:    shrq $2, %r11
+; AVX2-NEXT:    andq %rcx, %r11
+; AVX2-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX2-NEXT:    movq %r11, %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    shrq %r11
+; AVX2-NEXT:    andq %rdi, %r11
+; AVX2-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX2-NEXT:    movbeq 24(%rsi), %rbx
+; AVX2-NEXT:    movq %rbx, %r14
+; AVX2-NEXT:    shrq $4, %r14
+; AVX2-NEXT:    andq %rdx, %r14
+; AVX2-NEXT:    andq %rdx, %rbx
+; AVX2-NEXT:    shlq $4, %rbx
+; AVX2-NEXT:    orq %r14, %rbx
+; AVX2-NEXT:    movq %rbx, %r14
+; AVX2-NEXT:    andq %rcx, %r14
+; AVX2-NEXT:    shrq $2, %rbx
+; AVX2-NEXT:    andq %rcx, %rbx
+; AVX2-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX2-NEXT:    movq %rbx, %r14
+; AVX2-NEXT:    andq %rdi, %r14
+; AVX2-NEXT:    shrq %rbx
+; AVX2-NEXT:    andq %rdi, %rbx
+; AVX2-NEXT:    movbeq 16(%rsi), %r15
+; AVX2-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX2-NEXT:    movq %r15, %r14
+; AVX2-NEXT:    shrq $4, %r14
+; AVX2-NEXT:    andq %rdx, %r14
+; AVX2-NEXT:    andq %rdx, %r15
+; AVX2-NEXT:    shlq $4, %r15
+; AVX2-NEXT:    orq %r14, %r15
+; AVX2-NEXT:    movq %r15, %r14
+; AVX2-NEXT:    andq %rcx, %r14
+; AVX2-NEXT:    shrq $2, %r15
+; AVX2-NEXT:    andq %rcx, %r15
+; AVX2-NEXT:    leaq (%r15,%r14,4), %r14
+; AVX2-NEXT:    movq %r14, %r15
+; AVX2-NEXT:    andq %rdi, %r15
+; AVX2-NEXT:    shrq %r14
+; AVX2-NEXT:    andq %rdi, %r14
+; AVX2-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX2-NEXT:    movbeq 8(%rsi), %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    andq %rdx, %r15
+; AVX2-NEXT:    shlq $4, %r15
+; AVX2-NEXT:    orq %r12, %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    andq %rcx, %r12
+; AVX2-NEXT:    shrq $2, %r15
+; AVX2-NEXT:    andq %rcx, %r15
+; AVX2-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    andq %rdi, %r12
+; AVX2-NEXT:    shrq %r15
+; AVX2-NEXT:    andq %rdi, %r15
+; AVX2-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX2-NEXT:    movbeq (%rsi), %rsi
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    shrq $4, %r12
+; AVX2-NEXT:    andq %rdx, %r12
+; AVX2-NEXT:    andq %rdx, %rsi
+; AVX2-NEXT:    shlq $4, %rsi
+; AVX2-NEXT:    orq %r12, %rsi
+; AVX2-NEXT:    movq %rsi, %rdx
+; AVX2-NEXT:    andq %rcx, %rdx
+; AVX2-NEXT:    shrq $2, %rsi
+; AVX2-NEXT:    andq %rcx, %rsi
+; AVX2-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX2-NEXT:    movq %rcx, %rdx
+; AVX2-NEXT:    andq %rdi, %rdx
+; AVX2-NEXT:    shrq %rcx
+; AVX2-NEXT:    andq %rdi, %rcx
+; AVX2-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX2-NEXT:    movq %rcx, 56(%rax)
+; AVX2-NEXT:    movq %r15, 48(%rax)
+; AVX2-NEXT:    movq %r14, 40(%rax)
+; AVX2-NEXT:    movq %rbx, 32(%rax)
+; AVX2-NEXT:    movq %r11, 24(%rax)
+; AVX2-NEXT:    movq %r10, 16(%rax)
+; AVX2-NEXT:    movq %r9, 8(%rax)
+; AVX2-NEXT:    movq %r8, (%rax)
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: bitreverse_i512_load:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    pushq %r15
+; AVX512F-NEXT:    pushq %r14
+; AVX512F-NEXT:    pushq %r12
+; AVX512F-NEXT:    pushq %rbx
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    movbeq 56(%rsi), %rdi
+; AVX512F-NEXT:    movq %rdi, %rcx
+; AVX512F-NEXT:    shrq $4, %rcx
+; AVX512F-NEXT:    movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F
+; AVX512F-NEXT:    andq %rdx, %rcx
+; AVX512F-NEXT:    andq %rdx, %rdi
+; AVX512F-NEXT:    shlq $4, %rdi
+; AVX512F-NEXT:    orq %rcx, %rdi
+; AVX512F-NEXT:    movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333
+; AVX512F-NEXT:    movq %rdi, %r8
+; AVX512F-NEXT:    andq %rcx, %r8
+; AVX512F-NEXT:    shrq $2, %rdi
+; AVX512F-NEXT:    andq %rcx, %rdi
+; AVX512F-NEXT:    leaq (%rdi,%r8,4), %r8
+; AVX512F-NEXT:    movabsq $6148914691236517205, %rdi # imm = 0x5555555555555555
+; AVX512F-NEXT:    movq %r8, %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    shrq %r8
+; AVX512F-NEXT:    andq %rdi, %r8
+; AVX512F-NEXT:    movbeq 48(%rsi), %r10
+; AVX512F-NEXT:    leaq (%r8,%r9,2), %r8
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    shrq $4, %r9
+; AVX512F-NEXT:    andq %rdx, %r9
+; AVX512F-NEXT:    andq %rdx, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r9, %r10
+; AVX512F-NEXT:    movq %r10, %r9
+; AVX512F-NEXT:    andq %rcx, %r9
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rcx, %r10
+; AVX512F-NEXT:    leaq (%r10,%r9,4), %r9
+; AVX512F-NEXT:    movq %r9, %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    shrq %r9
+; AVX512F-NEXT:    andq %rdi, %r9
+; AVX512F-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512F-NEXT:    movbeq 40(%rsi), %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    shrq $4, %r11
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    andq %rdx, %r10
+; AVX512F-NEXT:    shlq $4, %r10
+; AVX512F-NEXT:    orq %r11, %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rcx, %r11
+; AVX512F-NEXT:    shrq $2, %r10
+; AVX512F-NEXT:    andq %rcx, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512F-NEXT:    movq %r10, %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    shrq %r10
+; AVX512F-NEXT:    andq %rdi, %r10
+; AVX512F-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512F-NEXT:    movbeq 32(%rsi), %r11
+; AVX512F-NEXT:    movq %r11, %rbx
+; AVX512F-NEXT:    shrq $4, %rbx
+; AVX512F-NEXT:    andq %rdx, %rbx
+; AVX512F-NEXT:    andq %rdx, %r11
+; AVX512F-NEXT:    shlq $4, %r11
+; AVX512F-NEXT:    orq %rbx, %r11
+; AVX512F-NEXT:    movq %r11, %rbx
+; AVX512F-NEXT:    andq %rcx, %rbx
+; AVX512F-NEXT:    shrq $2, %r11
+; AVX512F-NEXT:    andq %rcx, %r11
+; AVX512F-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512F-NEXT:    movq %r11, %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    shrq %r11
+; AVX512F-NEXT:    andq %rdi, %r11
+; AVX512F-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX512F-NEXT:    movbeq 24(%rsi), %rbx
+; AVX512F-NEXT:    movq %rbx, %r14
+; AVX512F-NEXT:    shrq $4, %r14
+; AVX512F-NEXT:    andq %rdx, %r14
+; AVX512F-NEXT:    andq %rdx, %rbx
+; AVX512F-NEXT:    shlq $4, %rbx
+; AVX512F-NEXT:    orq %r14, %rbx
+; AVX512F-NEXT:    movq %rbx, %r14
+; AVX512F-NEXT:    andq %rcx, %r14
+; AVX512F-NEXT:    shrq $2, %rbx
+; AVX512F-NEXT:    andq %rcx, %rbx
+; AVX512F-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX512F-NEXT:    movq %rbx, %r14
+; AVX512F-NEXT:    andq %rdi, %r14
+; AVX512F-NEXT:    shrq %rbx
+; AVX512F-NEXT:    andq %rdi, %rbx
+; AVX512F-NEXT:    movbeq 16(%rsi), %r15
+; AVX512F-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512F-NEXT:    movq %r15, %r14
+; AVX512F-NEXT:    shrq $4, %r14
+; AVX512F-NEXT:    andq %rdx, %r14
+; AVX512F-NEXT:    andq %rdx, %r15
+; AVX512F-NEXT:    shlq $4, %r15
+; AVX512F-NEXT:    orq %r14, %r15
+; AVX512F-NEXT:    movq %r15, %r14
+; AVX512F-NEXT:    andq %rcx, %r14
+; AVX512F-NEXT:    shrq $2, %r15
+; AVX512F-NEXT:    andq %rcx, %r15
+; AVX512F-NEXT:    leaq (%r15,%r14,4), %r14
+; AVX512F-NEXT:    movq %r14, %r15
+; AVX512F-NEXT:    andq %rdi, %r15
+; AVX512F-NEXT:    shrq %r14
+; AVX512F-NEXT:    andq %rdi, %r14
+; AVX512F-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512F-NEXT:    movbeq 8(%rsi), %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdx, %r12
+; AVX512F-NEXT:    andq %rdx, %r15
+; AVX512F-NEXT:    shlq $4, %r15
+; AVX512F-NEXT:    orq %r12, %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    andq %rcx, %r12
+; AVX512F-NEXT:    shrq $2, %r15
+; AVX512F-NEXT:    andq %rcx, %r15
+; AVX512F-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512F-NEXT:    movq %r15, %r12
+; AVX512F-NEXT:    andq %rdi, %r12
+; AVX512F-NEXT:    shrq %r15
+; AVX512F-NEXT:    andq %rdi, %r15
+; AVX512F-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512F-NEXT:    movbeq (%rsi), %rsi
+; AVX512F-NEXT:    movq %rsi, %r12
+; AVX512F-NEXT:    shrq $4, %r12
+; AVX512F-NEXT:    andq %rdx, %r12
+; AVX512F-NEXT:    andq %rdx, %rsi
+; AVX512F-NEXT:    shlq $4, %rsi
+; AVX512F-NEXT:    orq %r12, %rsi
+; AVX512F-NEXT:    movq %rsi, %rdx
+; AVX512F-NEXT:    andq %rcx, %rdx
+; AVX512F-NEXT:    shrq $2, %rsi
+; AVX512F-NEXT:    andq %rcx, %rsi
+; AVX512F-NEXT:    leaq (%rsi,%rdx,4), %rcx
+; AVX512F-NEXT:    movq %rcx, %rdx
+; AVX512F-NEXT:    andq %rdi, %rdx
+; AVX512F-NEXT:    shrq %rcx
+; AVX512F-NEXT:    andq %rdi, %rcx
+; AVX512F-NEXT:    leaq (%rcx,%rdx,2), %rcx
+; AVX512F-NEXT:    movq %rcx, 56(%rax)
+; AVX512F-NEXT:    movq %r15, 48(%rax)
+; AVX512F-NEXT:    movq %r14, 40(%rax)
+; AVX512F-NEXT:    movq %rbx, 32(%rax)
+; AVX512F-NEXT:    movq %r11, 24(%rax)
+; AVX512F-NEXT:    movq %r10, 16(%rax)
+; AVX512F-NEXT:    movq %r9, 8(%rax)
+; AVX512F-NEXT:    movq %r8, (%rax)
+; AVX512F-NEXT:    popq %rbx
+; AVX512F-NEXT:    popq %r12
+; AVX512F-NEXT:    popq %r14
+; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: bitreverse_i512_load:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    pushq %r15
+; AVX512VL-NEXT:    pushq %r14
+; AVX512VL-NEXT:    pushq %r12
+; AVX512VL-NEXT:    pushq %rbx
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    movbeq 56(%rsi), %rcx
+; AVX512VL-NEXT:    movq %rcx, %rdx
+; AVX512VL-NEXT:    shrq $4, %rdx
+; AVX512VL-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VL-NEXT:    andq %rdi, %rdx
+; AVX512VL-NEXT:    andq %rdi, %rcx
+; AVX512VL-NEXT:    shlq $4, %rcx
+; AVX512VL-NEXT:    orq %rdx, %rcx
+; AVX512VL-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VL-NEXT:    movq %rcx, %r8
+; AVX512VL-NEXT:    andq %rdx, %r8
+; AVX512VL-NEXT:    shrq $2, %rcx
+; AVX512VL-NEXT:    andq %rdx, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%r8,4), %rcx
+; AVX512VL-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512VL-NEXT:    movq %rcx, %r9
+; AVX512VL-NEXT:    andq %r8, %r9
+; AVX512VL-NEXT:    shrq %rcx
+; AVX512VL-NEXT:    andq %r8, %rcx
+; AVX512VL-NEXT:    leaq (%rcx,%r9,2), %rcx
+; AVX512VL-NEXT:    movbeq 48(%rsi), %r9
+; AVX512VL-NEXT:    movq %r9, %r10
+; AVX512VL-NEXT:    shrq $4, %r10
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    andq %rdi, %r9
+; AVX512VL-NEXT:    shlq $4, %r9
+; AVX512VL-NEXT:    orq %r10, %r9
+; AVX512VL-NEXT:    movq %r9, %r10
+; AVX512VL-NEXT:    andq %rdx, %r10
+; AVX512VL-NEXT:    shrq $2, %r9
+; AVX512VL-NEXT:    andq %rdx, %r9
+; AVX512VL-NEXT:    leaq (%r9,%r10,4), %r9
+; AVX512VL-NEXT:    movq %r9, %r10
+; AVX512VL-NEXT:    andq %r8, %r10
+; AVX512VL-NEXT:    shrq %r9
+; AVX512VL-NEXT:    andq %r8, %r9
+; AVX512VL-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512VL-NEXT:    movbeq 40(%rsi), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    shrq $4, %r11
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    andq %rdi, %r10
+; AVX512VL-NEXT:    shlq $4, %r10
+; AVX512VL-NEXT:    orq %r11, %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    shrq $2, %r10
+; AVX512VL-NEXT:    andq %rdx, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VL-NEXT:    movq %r10, %r11
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    shrq %r10
+; AVX512VL-NEXT:    andq %r8, %r10
+; AVX512VL-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VL-NEXT:    movbeq 32(%rsi), %r11
+; AVX512VL-NEXT:    movq %r11, %rbx
+; AVX512VL-NEXT:    shrq $4, %rbx
+; AVX512VL-NEXT:    andq %rdi, %rbx
+; AVX512VL-NEXT:    andq %rdi, %r11
+; AVX512VL-NEXT:    shlq $4, %r11
+; AVX512VL-NEXT:    orq %rbx, %r11
+; AVX512VL-NEXT:    movq %r11, %rbx
+; AVX512VL-NEXT:    andq %rdx, %rbx
+; AVX512VL-NEXT:    shrq $2, %r11
+; AVX512VL-NEXT:    andq %rdx, %r11
+; AVX512VL-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512VL-NEXT:    movq %r11, %rbx
+; AVX512VL-NEXT:    andq %r8, %rbx
+; AVX512VL-NEXT:    shrq %r11
+; AVX512VL-NEXT:    andq %r8, %r11
+; AVX512VL-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX512VL-NEXT:    movbeq 24(%rsi), %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    shrq $4, %r14
+; AVX512VL-NEXT:    andq %rdi, %r14
+; AVX512VL-NEXT:    andq %rdi, %rbx
+; AVX512VL-NEXT:    shlq $4, %rbx
+; AVX512VL-NEXT:    orq %r14, %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    andq %rdx, %r14
+; AVX512VL-NEXT:    shrq $2, %rbx
+; AVX512VL-NEXT:    andq %rdx, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX512VL-NEXT:    movq %rbx, %r14
+; AVX512VL-NEXT:    andq %r8, %r14
+; AVX512VL-NEXT:    shrq %rbx
+; AVX512VL-NEXT:    andq %r8, %rbx
+; AVX512VL-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512VL-NEXT:    movbeq 16(%rsi), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    shrq $4, %r15
+; AVX512VL-NEXT:    andq %rdi, %r15
+; AVX512VL-NEXT:    andq %rdi, %r14
+; AVX512VL-NEXT:    shlq $4, %r14
+; AVX512VL-NEXT:    orq %r15, %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    andq %rdx, %r15
+; AVX512VL-NEXT:    shrq $2, %r14
+; AVX512VL-NEXT:    andq %rdx, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512VL-NEXT:    movq %r14, %r15
+; AVX512VL-NEXT:    andq %r8, %r15
+; AVX512VL-NEXT:    shrq %r14
+; AVX512VL-NEXT:    andq %r8, %r14
+; AVX512VL-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512VL-NEXT:    movbeq 8(%rsi), %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    andq %rdi, %r15
+; AVX512VL-NEXT:    shlq $4, %r15
+; AVX512VL-NEXT:    orq %r12, %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    andq %rdx, %r12
+; AVX512VL-NEXT:    shrq $2, %r15
+; AVX512VL-NEXT:    andq %rdx, %r15
+; AVX512VL-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512VL-NEXT:    movq %r15, %r12
+; AVX512VL-NEXT:    andq %r8, %r12
+; AVX512VL-NEXT:    shrq %r15
+; AVX512VL-NEXT:    andq %r8, %r15
+; AVX512VL-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512VL-NEXT:    movbeq (%rsi), %rsi
+; AVX512VL-NEXT:    movq %rsi, %r12
+; AVX512VL-NEXT:    shrq $4, %r12
+; AVX512VL-NEXT:    andq %rdi, %r12
+; AVX512VL-NEXT:    andq %rdi, %rsi
+; AVX512VL-NEXT:    shlq $4, %rsi
+; AVX512VL-NEXT:    orq %r12, %rsi
+; AVX512VL-NEXT:    movq %rsi, %rdi
+; AVX512VL-NEXT:    andq %rdx, %rdi
+; AVX512VL-NEXT:    shrq $2, %rsi
+; AVX512VL-NEXT:    andq %rdx, %rsi
+; AVX512VL-NEXT:    leaq (%rsi,%rdi,4), %rdx
+; AVX512VL-NEXT:    movq %rdx, %rsi
+; AVX512VL-NEXT:    andq %r8, %rsi
+; AVX512VL-NEXT:    shrq %rdx
+; AVX512VL-NEXT:    andq %r8, %rdx
+; AVX512VL-NEXT:    leaq (%rdx,%rsi,2), %rdx
+; AVX512VL-NEXT:    movq %rdx, 56(%rax)
+; AVX512VL-NEXT:    movq %r15, 48(%rax)
+; AVX512VL-NEXT:    movq %r14, 40(%rax)
+; AVX512VL-NEXT:    movq %rbx, 32(%rax)
+; AVX512VL-NEXT:    movq %r11, 24(%rax)
+; AVX512VL-NEXT:    movq %r10, 16(%rax)
+; AVX512VL-NEXT:    movq %r9, 8(%rax)
+; AVX512VL-NEXT:    movq %rcx, (%rax)
+; AVX512VL-NEXT:    popq %rbx
+; AVX512VL-NEXT:    popq %r12
+; AVX512VL-NEXT:    popq %r14
+; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: bitreverse_i512_load:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    pushq %r15
+; AVX512VBMI-NEXT:    pushq %r14
+; AVX512VBMI-NEXT:    pushq %r12
+; AVX512VBMI-NEXT:    pushq %rbx
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    movbeq 56(%rsi), %rcx
+; AVX512VBMI-NEXT:    movq %rcx, %rdx
+; AVX512VBMI-NEXT:    shrq $4, %rdx
+; AVX512VBMI-NEXT:    movabsq $1085102592571150095, %rdi # imm = 0xF0F0F0F0F0F0F0F
+; AVX512VBMI-NEXT:    andq %rdi, %rdx
+; AVX512VBMI-NEXT:    andq %rdi, %rcx
+; AVX512VBMI-NEXT:    shlq $4, %rcx
+; AVX512VBMI-NEXT:    orq %rdx, %rcx
+; AVX512VBMI-NEXT:    movabsq $3689348814741910323, %rdx # imm = 0x3333333333333333
+; AVX512VBMI-NEXT:    movq %rcx, %r8
+; AVX512VBMI-NEXT:    andq %rdx, %r8
+; AVX512VBMI-NEXT:    shrq $2, %rcx
+; AVX512VBMI-NEXT:    andq %rdx, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%r8,4), %rcx
+; AVX512VBMI-NEXT:    movabsq $6148914691236517205, %r8 # imm = 0x5555555555555555
+; AVX512VBMI-NEXT:    movq %rcx, %r9
+; AVX512VBMI-NEXT:    andq %r8, %r9
+; AVX512VBMI-NEXT:    shrq %rcx
+; AVX512VBMI-NEXT:    andq %r8, %rcx
+; AVX512VBMI-NEXT:    leaq (%rcx,%r9,2), %rcx
+; AVX512VBMI-NEXT:    movbeq 48(%rsi), %r9
+; AVX512VBMI-NEXT:    movq %r9, %r10
+; AVX512VBMI-NEXT:    shrq $4, %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    andq %rdi, %r9
+; AVX512VBMI-NEXT:    shlq $4, %r9
+; AVX512VBMI-NEXT:    orq %r10, %r9
+; AVX512VBMI-NEXT:    movq %r9, %r10
+; AVX512VBMI-NEXT:    andq %rdx, %r10
+; AVX512VBMI-NEXT:    shrq $2, %r9
+; AVX512VBMI-NEXT:    andq %rdx, %r9
+; AVX512VBMI-NEXT:    leaq (%r9,%r10,4), %r9
+; AVX512VBMI-NEXT:    movq %r9, %r10
+; AVX512VBMI-NEXT:    andq %r8, %r10
+; AVX512VBMI-NEXT:    shrq %r9
+; AVX512VBMI-NEXT:    andq %r8, %r9
+; AVX512VBMI-NEXT:    leaq (%r9,%r10,2), %r9
+; AVX512VBMI-NEXT:    movbeq 40(%rsi), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    shrq $4, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    andq %rdi, %r10
+; AVX512VBMI-NEXT:    shlq $4, %r10
+; AVX512VBMI-NEXT:    orq %r11, %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    shrq $2, %r10
+; AVX512VBMI-NEXT:    andq %rdx, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,4), %r10
+; AVX512VBMI-NEXT:    movq %r10, %r11
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    shrq %r10
+; AVX512VBMI-NEXT:    andq %r8, %r10
+; AVX512VBMI-NEXT:    leaq (%r10,%r11,2), %r10
+; AVX512VBMI-NEXT:    movbeq 32(%rsi), %r11
+; AVX512VBMI-NEXT:    movq %r11, %rbx
+; AVX512VBMI-NEXT:    shrq $4, %rbx
+; AVX512VBMI-NEXT:    andq %rdi, %rbx
+; AVX512VBMI-NEXT:    andq %rdi, %r11
+; AVX512VBMI-NEXT:    shlq $4, %r11
+; AVX512VBMI-NEXT:    orq %rbx, %r11
+; AVX512VBMI-NEXT:    movq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %rdx, %rbx
+; AVX512VBMI-NEXT:    shrq $2, %r11
+; AVX512VBMI-NEXT:    andq %rdx, %r11
+; AVX512VBMI-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512VBMI-NEXT:    movq %r11, %rbx
+; AVX512VBMI-NEXT:    andq %r8, %rbx
+; AVX512VBMI-NEXT:    shrq %r11
+; AVX512VBMI-NEXT:    andq %r8, %r11
+; AVX512VBMI-NEXT:    leaq (%r11,%rbx,2), %r11
+; AVX512VBMI-NEXT:    movbeq 24(%rsi), %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    shrq $4, %r14
+; AVX512VBMI-NEXT:    andq %rdi, %r14
+; AVX512VBMI-NEXT:    andq %rdi, %rbx
+; AVX512VBMI-NEXT:    shlq $4, %rbx
+; AVX512VBMI-NEXT:    orq %r14, %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    andq %rdx, %r14
+; AVX512VBMI-NEXT:    shrq $2, %rbx
+; AVX512VBMI-NEXT:    andq %rdx, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r14,4), %rbx
+; AVX512VBMI-NEXT:    movq %rbx, %r14
+; AVX512VBMI-NEXT:    andq %r8, %r14
+; AVX512VBMI-NEXT:    shrq %rbx
+; AVX512VBMI-NEXT:    andq %r8, %rbx
+; AVX512VBMI-NEXT:    leaq (%rbx,%r14,2), %rbx
+; AVX512VBMI-NEXT:    movbeq 16(%rsi), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    shrq $4, %r15
+; AVX512VBMI-NEXT:    andq %rdi, %r15
+; AVX512VBMI-NEXT:    andq %rdi, %r14
+; AVX512VBMI-NEXT:    shlq $4, %r14
+; AVX512VBMI-NEXT:    orq %r15, %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    andq %rdx, %r15
+; AVX512VBMI-NEXT:    shrq $2, %r14
+; AVX512VBMI-NEXT:    andq %rdx, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r15,4), %r14
+; AVX512VBMI-NEXT:    movq %r14, %r15
+; AVX512VBMI-NEXT:    andq %r8, %r15
+; AVX512VBMI-NEXT:    shrq %r14
+; AVX512VBMI-NEXT:    andq %r8, %r14
+; AVX512VBMI-NEXT:    leaq (%r14,%r15,2), %r14
+; AVX512VBMI-NEXT:    movbeq 8(%rsi), %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r15
+; AVX512VBMI-NEXT:    shlq $4, %r15
+; AVX512VBMI-NEXT:    orq %r12, %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    andq %rdx, %r12
+; AVX512VBMI-NEXT:    shrq $2, %r15
+; AVX512VBMI-NEXT:    andq %rdx, %r15
+; AVX512VBMI-NEXT:    leaq (%r15,%r12,4), %r15
+; AVX512VBMI-NEXT:    movq %r15, %r12
+; AVX512VBMI-NEXT:    andq %r8, %r12
+; AVX512VBMI-NEXT:    shrq %r15
+; AVX512VBMI-NEXT:    andq %r8, %r15
+; AVX512VBMI-NEXT:    leaq (%r15,%r12,2), %r15
+; AVX512VBMI-NEXT:    movbeq (%rsi), %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %r12
+; AVX512VBMI-NEXT:    shrq $4, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %r12
+; AVX512VBMI-NEXT:    andq %rdi, %rsi
+; AVX512VBMI-NEXT:    shlq $4, %rsi
+; AVX512VBMI-NEXT:    orq %r12, %rsi
+; AVX512VBMI-NEXT:    movq %rsi, %rdi
+; AVX512VBMI-NEXT:    andq %rdx, %rdi
+; AVX512VBMI-NEXT:    shrq $2, %rsi
+; AVX512VBMI-NEXT:    andq %rdx, %rsi
+; AVX512VBMI-NEXT:    leaq (%rsi,%rdi,4), %rdx
+; AVX512VBMI-NEXT:    movq %rdx, %rsi
+; AVX512VBMI-NEXT:    andq %r8, %rsi
+; AVX512VBMI-NEXT:    shrq %rdx
+; AVX512VBMI-NEXT:    andq %r8, %rdx
+; AVX512VBMI-NEXT:    leaq (%rdx,%rsi,2), %rdx
+; AVX512VBMI-NEXT:    movq %rdx, 56(%rax)
+; AVX512VBMI-NEXT:    movq %r15, 48(%rax)
+; AVX512VBMI-NEXT:    movq %r14, 40(%rax)
+; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
+; AVX512VBMI-NEXT:    movq %r11, 24(%rax)
+; AVX512VBMI-NEXT:    movq %r10, 16(%rax)
+; AVX512VBMI-NEXT:    movq %r9, 8(%rax)
+; AVX512VBMI-NEXT:    movq %rcx, (%rax)
+; AVX512VBMI-NEXT:    popq %rbx
+; AVX512VBMI-NEXT:    popq %r12
+; AVX512VBMI-NEXT:    popq %r14
+; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    retq
+  %a0 = load i512, ptr %p0
+  %res = call i512 @llvm.bitreverse.i512(i512 %a0)
+  ret i512 %res
+}
+
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; AVX512: {{.*}}
 ; CHECK: {{.*}}
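
For anyone skimming the generated checks above: on the scalar path, each 64-bit limb of the i512 is reversed with the standard bswap-plus-mask expansion, using the 0x0F0F0F0F0F0F0F0F, 0x3333333333333333 and 0x5555555555555555 constants visible in the asm. A minimal standalone IR sketch of what one limb goes through (function name illustrative only, not part of this patch):

declare i64 @llvm.bswap.i64(i64)

define i64 @bitreverse64_sketch(i64 %x) {
  ; Step 1: reverse the byte order.
  %b    = call i64 @llvm.bswap.i64(i64 %x)
  ; Step 2: swap the two nibbles inside each byte.
  %hi4  = lshr i64 %b, 4
  %hi4m = and i64 %hi4, 1085102592571150095   ; 0x0F0F0F0F0F0F0F0F
  %lo4  = and i64 %b, 1085102592571150095
  %lo4s = shl i64 %lo4, 4
  %n1   = or i64 %hi4m, %lo4s
  ; Step 3: swap adjacent 2-bit pairs.
  %hi2  = lshr i64 %n1, 2
  %hi2m = and i64 %hi2, 3689348814741910323   ; 0x3333333333333333
  %lo2  = and i64 %n1, 3689348814741910323
  %lo2s = shl i64 %lo2, 2
  %n2   = or i64 %hi2m, %lo2s
  ; Step 4: swap adjacent single bits.
  %hi1  = lshr i64 %n2, 1
  %hi1m = and i64 %hi1, 6148914691236517205   ; 0x5555555555555555
  %lo1  = and i64 %n2, 6148914691236517205
  %lo1s = shl i64 %lo1, 1
  %n3   = or i64 %hi1m, %lo1s
  ret i64 %n3
}

The lea-with-scale instructions in the checks (e.g. leaq (%rcx,%r8,4), %rcx) are just how the backend folds the shl-then-or of steps 3 and 4 into a single instruction.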