[llvm] [X86] Enable i512 fshl/fshr lowering on avx512 targets (PR #185615)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 11 03:25:12 PDT 2026
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/185615
>From e3e6008fa22a30109092c3103f4c0d5a0c8212b1 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 10 Mar 2026 10:56:32 +0000
Subject: [PATCH 1/4] [X86] Enable i512 fshl/fshr lowering on avx512 targets
If the operands are freely foldable to vectors, expand to a vector shift pair, along with handling for a zero shift amount.
Unfortunately, using the generic TargetLowering::expandFunnelShift results in additional shifts/scalarization instead of staying on the FPU.
Fixes #184828
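For reference, a minimal scalar sketch of the expansion, modelled on uint64_t in place of i512 (illustration only, not code from the patch): mask the amount, form both partial shifts, and keep the unshifted input when the masked amount is zero.

  #include <cstdint>
  #include <cassert>

  // fshl(x, y, z) expanded as a shift pair with zero-amount handling:
  // a shift by (bw - 0) would be out of range, so select x directly.
  static uint64_t fshl64_shiftpair(uint64_t x, uint64_t y, uint64_t z) {
    const unsigned bw = 64;
    unsigned amt = z & (bw - 1);              // z % bw
    if (amt == 0)
      return x;                               // zero-amount handling
    return (x << amt) | (y >> (bw - amt));    // the shift pair
  }

  int main() {
    assert(fshl64_shiftpair(0x8000000000000001ULL, 0xF000000000000000ULL, 4) ==
           0x1FULL);
    assert(fshl64_shiftpair(0xABCDULL, 0x1234ULL, 0) == 0xABCDULL);
    return 0;
  }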
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 54 +-
llvm/test/CodeGen/X86/funnel-shift-i512.ll | 2172 +++++++-------------
2 files changed, 743 insertions(+), 1483 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 9d158ad919ebb..62871e0743c0c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1891,6 +1891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL, MVT::i512, Custom);
setOperationAction(ISD::SHL, MVT::i512, Custom);
setOperationAction(ISD::SRA, MVT::i512, Custom);
+ setOperationAction(ISD::FSHR, MVT::i512, Custom);
+ setOperationAction(ISD::FSHL, MVT::i512, Custom);
setOperationAction(ISD::FSHR, MVT::i256, Custom);
setOperationAction(ISD::FSHL, MVT::i256, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
@@ -34525,30 +34527,56 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::FSHL:
case ISD::FSHR: {
+ EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
SDValue Amt = N->getOperand(2);
assert(Subtarget.useAVX512Regs() && "AVX512F required");
- assert(N->getValueType(0) == MVT::i256 && "Unexpected VT!");
+ assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
if (!mayFoldIntoVector(Op0, DAG, Subtarget) ||
!mayFoldIntoVector(Op1, DAG, Subtarget))
return;
+ unsigned BW = VT.getSizeInBits();
+ MVT AmtVT = MVT::i64;
+ MVT VecVT = MVT::getVectorVT(MVT::i64, BW / 64);
+ MVT BoolVT = MVT::getVectorVT(MVT::i1, BW / 64);
+ Amt = DAG.getNode(ISD::AND, dl, AmtVT, DAG.getZExtOrTrunc(Amt, dl, AmtVT),
+ DAG.getConstant(BW - 1, dl, AmtVT));
+
// fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
// fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
- SDValue Res = DAG.getBitcast(
- MVT::i512, concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
- DAG.getBitcast(MVT::v4i64, Op0), DAG, dl));
- Amt = DAG.getNode(ISD::AND, dl, Amt.getValueType(), Amt,
- DAG.getConstant(255, dl, Amt.getValueType()));
- if (Opc == ISD::FSHL) {
- Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
- Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
- DAG.getShiftAmountConstant(256, MVT::i512, dl));
- } else {
- Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
+ if (VT == MVT::i256) {
+ SDValue Res = concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
+ DAG.getBitcast(MVT::v4i64, Op0), DAG, dl);
+ Res = DAG.getBitcast(MVT::i512, Res);
+ if (Opc == ISD::FSHL) {
+ Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
+ DAG.getShiftAmountConstant(256, MVT::i512, dl));
+ } else {
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
+ }
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res));
+ return;
}
- Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res));
+
+ // fshl: z == 0 ? x : (x << (z % bw) | y >> (bw - (z % bw)))
+ // fshr: z == 0 ? y : (x << (bw - (z % bw)) | y >> (z % bw))
+ SDValue AmtZ = DAG.getSetCC(dl, MVT::i1, Amt, DAG.getConstant(0, dl, AmtVT),
+ ISD::SETNE);
+ SDValue Sel = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, AmtZ);
+ SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
+ DAG.getConstant(BW - 1, dl, AmtVT), Amt);
+ SDValue ShX =
+ DAG.getNode(ISD::SHL, dl, VT, Op0, Opc == ISD::FSHL ? Amt : InvAmt);
+ SDValue ShY =
+ DAG.getNode(ISD::SRL, dl, VT, Op1, Opc == ISD::FSHR ? Amt : InvAmt);
+ SDValue Res = DAG.getNode(ISD::OR, dl, VecVT, DAG.getBitcast(VecVT, ShX),
+ DAG.getBitcast(VecVT, ShY));
+ Res = DAG.getSelect(dl, VecVT, DAG.getBitcast(BoolVT, Sel), Res,
+ DAG.getBitcast(VecVT, Opc == ISD::FSHL ? Op0 : Op1));
+ Results.push_back(DAG.getBitcast(VT, Res));
return;
}
case ISD::CTPOP: {
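For completeness, the retained i256 path's double-width trick (the fshl/fshr comment in the hunk above), modelled here as i32 via uint64_t, standing in for i256 via i512 (again an illustration only, not code from the patch):

  #include <cstdint>
  #include <cassert>

  // Concatenate into a value of twice the width, shift by the masked
  // amount, then take the high half (fshl) or the low half (fshr).
  static uint32_t fshl32_dw(uint32_t x, uint32_t y, uint32_t z) {
    const unsigned bw = 32;
    uint64_t cat = ((uint64_t)x << bw) | y;   // (aext(x) << bw) | zext(y)
    return (uint32_t)((cat << (z & (bw - 1))) >> bw);
  }

  static uint32_t fshr32_dw(uint32_t x, uint32_t y, uint32_t z) {
    const unsigned bw = 32;
    uint64_t cat = ((uint64_t)x << bw) | y;
    return (uint32_t)(cat >> (z & (bw - 1)));
  }

  int main() {
    assert(fshl32_dw(0x80000001u, 0xF0000000u, 4) == 0x1Fu);
    assert(fshr32_dw(0x00000001u, 0x80000000u, 4) == 0x18000000u);
    return 0;
  }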
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i512.ll b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
index af48bff3cdff7..35c67a696dfef 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i512.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
@@ -1837,263 +1837,158 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq (%rsi), %r11
-; AVX512F-NEXT: movq 8(%rsi), %r10
-; AVX512F-NEXT: movq 16(%rsi), %r9
-; AVX512F-NEXT: movq 24(%rsi), %r8
-; AVX512F-NEXT: movq 32(%rdx), %r14
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq 56(%rsi), %rax
-; AVX512F-NEXT: cmovneq %r8, %rax
-; AVX512F-NEXT: movq 40(%rdx), %r13
-; AVX512F-NEXT: movq 48(%rsi), %rbp
-; AVX512F-NEXT: cmovneq %r9, %rbp
-; AVX512F-NEXT: movq 48(%rdx), %r15
-; AVX512F-NEXT: movq 40(%rsi), %rbx
-; AVX512F-NEXT: cmovneq %r10, %rbx
-; AVX512F-NEXT: movq 56(%rdx), %r12
-; AVX512F-NEXT: movq 32(%rsi), %rsi
-; AVX512F-NEXT: cmovneq %r11, %rsi
-; AVX512F-NEXT: cmovneq %r12, %r8
-; AVX512F-NEXT: cmovneq %r15, %r9
-; AVX512F-NEXT: cmovneq %r13, %r10
-; AVX512F-NEXT: cmovneq %r14, %r11
-; AVX512F-NEXT: cmovneq 16(%rdx), %r15
-; AVX512F-NEXT: cmovneq (%rdx), %r14
-; AVX512F-NEXT: cmovneq 8(%rdx), %r13
-; AVX512F-NEXT: cmovneq 24(%rdx), %r12
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmoveq %r12, %r13
-; AVX512F-NEXT: cmoveq %r15, %r14
-; AVX512F-NEXT: cmoveq %r11, %r15
-; AVX512F-NEXT: cmoveq %r10, %r12
-; AVX512F-NEXT: cmoveq %r9, %r11
-; AVX512F-NEXT: cmoveq %r8, %r10
-; AVX512F-NEXT: cmoveq %rsi, %r9
-; AVX512F-NEXT: cmoveq %rbx, %r8
-; AVX512F-NEXT: cmoveq %rbp, %rsi
-; AVX512F-NEXT: cmoveq %rax, %rbx
+; AVX512F-NEXT: subq $136, %rsp
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vmovups (%rdx), %zmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, (%rsp)
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm4
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $56, %eax
+; AVX512F-NEXT: leaq -{{[0-9]+}}(%rsp), %rdx
+; AVX512F-NEXT: subq %rax, %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm6 = zmm2[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm6, %zmm6
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpsllq %xmm1, %zmm5, %zmm1
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmovneq %rsi, %rbx
-; AVX512F-NEXT: cmovneq %r8, %rsi
-; AVX512F-NEXT: cmovneq %r9, %r8
-; AVX512F-NEXT: cmovneq %r10, %r9
-; AVX512F-NEXT: cmovneq %r11, %r10
-; AVX512F-NEXT: cmovneq %r12, %r11
-; AVX512F-NEXT: cmovneq %r15, %r12
-; AVX512F-NEXT: cmoveq %r13, %r14
-; AVX512F-NEXT: cmovneq %r13, %r15
-; AVX512F-NEXT: movq %r15, %rdx
-; AVX512F-NEXT: shldq %cl, %r14, %rdx
-; AVX512F-NEXT: movq %r12, %rdi
-; AVX512F-NEXT: shldq %cl, %r15, %rdi
-; AVX512F-NEXT: movq %r11, %r14
-; AVX512F-NEXT: shldq %cl, %r12, %r14
-; AVX512F-NEXT: movq %r10, %r15
-; AVX512F-NEXT: shldq %cl, %r11, %r15
-; AVX512F-NEXT: movq %r9, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq %r8, %r10
-; AVX512F-NEXT: shldq %cl, %r9, %r10
-; AVX512F-NEXT: movq %rsi, %r9
-; AVX512F-NEXT: shldq %cl, %r8, %r9
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rsi, %rbx
-; AVX512F-NEXT: movq %rbx, 56(%rax)
-; AVX512F-NEXT: movq %r9, 48(%rax)
-; AVX512F-NEXT: movq %r10, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r15, 24(%rax)
-; AVX512F-NEXT: movq %r14, 16(%rax)
-; AVX512F-NEXT: movq %rdi, 8(%rax)
-; AVX512F-NEXT: movq %rdx, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT: movq %rcx, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: movl %edx, %esi
+; AVX512F-NEXT: andl $63, %esi
+; AVX512F-NEXT: vmovq %rsi, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm5, %zmm3
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: negq %rcx
+; AVX512F-NEXT: sbbl %edx, %edx
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq (%rsi), %r11
-; AVX512VL-NEXT: movq 8(%rsi), %r9
-; AVX512VL-NEXT: movq 16(%rsi), %rax
-; AVX512VL-NEXT: movq 24(%rsi), %r8
-; AVX512VL-NEXT: movq 32(%rdx), %r15
-; AVX512VL-NEXT: movq 40(%rdx), %r12
-; AVX512VL-NEXT: movq 48(%rdx), %r14
-; AVX512VL-NEXT: movq 56(%rdx), %rbx
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq 56(%rsi), %r13
-; AVX512VL-NEXT: cmovneq %r8, %r13
-; AVX512VL-NEXT: movq 48(%rsi), %rbp
-; AVX512VL-NEXT: cmovneq %rax, %rbp
-; AVX512VL-NEXT: movq 40(%rsi), %r10
-; AVX512VL-NEXT: cmovneq %r9, %r10
-; AVX512VL-NEXT: movq 32(%rsi), %rsi
-; AVX512VL-NEXT: cmovneq %r11, %rsi
-; AVX512VL-NEXT: cmovneq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %r14, %rax
-; AVX512VL-NEXT: cmovneq %r15, %r11
-; AVX512VL-NEXT: cmovneq 16(%rdx), %r14
-; AVX512VL-NEXT: cmovneq (%rdx), %r15
-; AVX512VL-NEXT: cmovneq %r12, %r9
-; AVX512VL-NEXT: cmovneq 8(%rdx), %r12
-; AVX512VL-NEXT: cmovneq 24(%rdx), %rbx
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmoveq %rbx, %r12
-; AVX512VL-NEXT: cmoveq %r14, %r15
-; AVX512VL-NEXT: cmoveq %r11, %r14
-; AVX512VL-NEXT: cmoveq %r9, %rbx
-; AVX512VL-NEXT: cmoveq %rax, %r11
-; AVX512VL-NEXT: cmoveq %r8, %r9
-; AVX512VL-NEXT: cmoveq %rsi, %rax
-; AVX512VL-NEXT: cmoveq %r10, %r8
-; AVX512VL-NEXT: cmoveq %rbp, %rsi
-; AVX512VL-NEXT: cmoveq %r13, %r10
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmovneq %rsi, %r10
-; AVX512VL-NEXT: cmovneq %r8, %rsi
-; AVX512VL-NEXT: cmovneq %rax, %r8
-; AVX512VL-NEXT: cmovneq %r9, %rax
-; AVX512VL-NEXT: cmovneq %r11, %r9
-; AVX512VL-NEXT: cmovneq %rbx, %r11
-; AVX512VL-NEXT: cmovneq %r14, %rbx
-; AVX512VL-NEXT: cmoveq %r12, %r15
-; AVX512VL-NEXT: cmovneq %r12, %r14
-; AVX512VL-NEXT: movq %r14, %rdx
-; AVX512VL-NEXT: shldq %cl, %r15, %rdx
-; AVX512VL-NEXT: movq %rbx, %r15
-; AVX512VL-NEXT: shldq %cl, %r14, %r15
-; AVX512VL-NEXT: movq %r11, %r14
-; AVX512VL-NEXT: shldq %cl, %rbx, %r14
-; AVX512VL-NEXT: movq %r9, %rbx
-; AVX512VL-NEXT: shldq %cl, %r11, %rbx
-; AVX512VL-NEXT: movq %rax, %r11
-; AVX512VL-NEXT: shldq %cl, %r9, %r11
-; AVX512VL-NEXT: movq %r8, %r9
-; AVX512VL-NEXT: shldq %cl, %rax, %r9
+; AVX512VL-NEXT: subq $136, %rsp
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT: vmovups (%rdx), %ymm2
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rsi, %rdi
-; AVX512VL-NEXT: shldq %cl, %r8, %rdi
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rsi, %r10
-; AVX512VL-NEXT: movq %r10, 56(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r9, 40(%rax)
-; AVX512VL-NEXT: movq %r11, 32(%rax)
-; AVX512VL-NEXT: movq %rbx, 24(%rax)
-; AVX512VL-NEXT: movq %r14, 16(%rax)
-; AVX512VL-NEXT: movq %r15, 8(%rax)
-; AVX512VL-NEXT: movq %rdx, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
+; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm2, (%rsp)
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: andl $63, %edx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: shrl $3, %edx
+; AVX512VL-NEXT: andl $56, %edx
+; AVX512VL-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: subq %rdx, %rsi
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm4
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm6 = zmm5[7],zmm3[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm6, %zmm6
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT: movq %rcx, %rdx
+; AVX512VL-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512VL-NEXT: movl %edx, %esi
+; AVX512VL-NEXT: andl $63, %esi
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: negq %rcx
+; AVX512VL-NEXT: sbbl %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: addq $136, %rsp
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq (%rsi), %r11
-; AVX512VBMI-NEXT: movq 8(%rsi), %r9
-; AVX512VBMI-NEXT: movq 16(%rsi), %rax
-; AVX512VBMI-NEXT: movq 24(%rsi), %r8
-; AVX512VBMI-NEXT: movq 32(%rdx), %r15
-; AVX512VBMI-NEXT: movq 40(%rdx), %r12
-; AVX512VBMI-NEXT: movq 48(%rdx), %r14
-; AVX512VBMI-NEXT: movq 56(%rdx), %rbx
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq 56(%rsi), %r13
-; AVX512VBMI-NEXT: cmovneq %r8, %r13
-; AVX512VBMI-NEXT: movq 48(%rsi), %rbp
-; AVX512VBMI-NEXT: cmovneq %rax, %rbp
-; AVX512VBMI-NEXT: movq 40(%rsi), %r10
-; AVX512VBMI-NEXT: cmovneq %r9, %r10
-; AVX512VBMI-NEXT: movq 32(%rsi), %rsi
-; AVX512VBMI-NEXT: cmovneq %r11, %rsi
-; AVX512VBMI-NEXT: cmovneq %rbx, %r8
-; AVX512VBMI-NEXT: cmovneq %r14, %rax
-; AVX512VBMI-NEXT: cmovneq %r15, %r11
-; AVX512VBMI-NEXT: cmovneq 16(%rdx), %r14
-; AVX512VBMI-NEXT: cmovneq (%rdx), %r15
-; AVX512VBMI-NEXT: cmovneq %r12, %r9
-; AVX512VBMI-NEXT: cmovneq 8(%rdx), %r12
-; AVX512VBMI-NEXT: cmovneq 24(%rdx), %rbx
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: cmoveq %rbx, %r12
-; AVX512VBMI-NEXT: cmoveq %r14, %r15
-; AVX512VBMI-NEXT: cmoveq %r11, %r14
-; AVX512VBMI-NEXT: cmoveq %r9, %rbx
-; AVX512VBMI-NEXT: cmoveq %rax, %r11
-; AVX512VBMI-NEXT: cmoveq %r8, %r9
-; AVX512VBMI-NEXT: cmoveq %rsi, %rax
-; AVX512VBMI-NEXT: cmoveq %r10, %r8
-; AVX512VBMI-NEXT: cmoveq %rbp, %rsi
-; AVX512VBMI-NEXT: cmoveq %r13, %r10
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmovneq %rsi, %r10
-; AVX512VBMI-NEXT: cmovneq %r8, %rsi
-; AVX512VBMI-NEXT: cmovneq %rax, %r8
-; AVX512VBMI-NEXT: cmovneq %r9, %rax
-; AVX512VBMI-NEXT: cmovneq %r11, %r9
-; AVX512VBMI-NEXT: cmovneq %rbx, %r11
-; AVX512VBMI-NEXT: cmovneq %r14, %rbx
-; AVX512VBMI-NEXT: cmoveq %r12, %r15
-; AVX512VBMI-NEXT: cmovneq %r12, %r14
-; AVX512VBMI-NEXT: movq %r14, %rdx
-; AVX512VBMI-NEXT: shldq %cl, %r15, %rdx
-; AVX512VBMI-NEXT: movq %rbx, %r15
-; AVX512VBMI-NEXT: shldq %cl, %r14, %r15
-; AVX512VBMI-NEXT: movq %r11, %r14
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: movq %r9, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rbx
-; AVX512VBMI-NEXT: movq %rax, %r11
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r11
-; AVX512VBMI-NEXT: movq %r8, %r9
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r9
+; AVX512VBMI-NEXT: subq $136, %rsp
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm2
+; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %rsi, %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rdi
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %r10
-; AVX512VBMI-NEXT: movq %r10, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT: movq %r9, 40(%rax)
-; AVX512VBMI-NEXT: movq %r11, 32(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 24(%rax)
-; AVX512VBMI-NEXT: movq %r14, 16(%rax)
-; AVX512VBMI-NEXT: movq %r15, 8(%rax)
-; AVX512VBMI-NEXT: movq %rdx, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rsp)
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $3, %edx
+; AVX512VBMI-NEXT: andl $56, %edx
+; AVX512VBMI-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VBMI-NEXT: subq %rdx, %rsi
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VBMI-NEXT: movq %rcx, %rdx
+; AVX512VBMI-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm4[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm2, %zmm4
+; AVX512VBMI-NEXT: xorl %edx, %edx
+; AVX512VBMI-NEXT: negq %rcx
+; AVX512VBMI-NEXT: sbbl %edx, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vporq %zmm4, %zmm1, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT: addq $136, %rsp
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%a1 = load i512, ptr %p1
@@ -2264,242 +2159,162 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq (%rsi), %r8
-; AVX512F-NEXT: movq 8(%rsi), %r9
-; AVX512F-NEXT: movq 16(%rsi), %r10
-; AVX512F-NEXT: movq 24(%rsi), %r11
-; AVX512F-NEXT: movq 32(%rdx), %r13
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq 56(%rsi), %rax
-; AVX512F-NEXT: cmoveq %r11, %rax
-; AVX512F-NEXT: movq 40(%rdx), %rbx
-; AVX512F-NEXT: movq 48(%rsi), %rbp
-; AVX512F-NEXT: cmoveq %r10, %rbp
-; AVX512F-NEXT: movq 48(%rdx), %r14
-; AVX512F-NEXT: movq 40(%rsi), %r12
-; AVX512F-NEXT: cmoveq %r9, %r12
-; AVX512F-NEXT: movq 56(%rdx), %r15
-; AVX512F-NEXT: movq 32(%rsi), %rsi
-; AVX512F-NEXT: cmoveq %r8, %rsi
-; AVX512F-NEXT: cmoveq %r15, %r11
-; AVX512F-NEXT: cmoveq %r14, %r10
-; AVX512F-NEXT: cmoveq %rbx, %r9
-; AVX512F-NEXT: cmoveq %r13, %r8
-; AVX512F-NEXT: cmoveq 24(%rdx), %r15
-; AVX512F-NEXT: cmoveq 8(%rdx), %rbx
-; AVX512F-NEXT: cmoveq (%rdx), %r13
-; AVX512F-NEXT: cmoveq 16(%rdx), %r14
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmovneq %r14, %r13
-; AVX512F-NEXT: cmovneq %r15, %rbx
-; AVX512F-NEXT: cmovneq %r8, %r14
-; AVX512F-NEXT: cmovneq %r9, %r15
-; AVX512F-NEXT: cmovneq %r10, %r8
-; AVX512F-NEXT: cmovneq %r11, %r9
-; AVX512F-NEXT: cmovneq %rsi, %r10
-; AVX512F-NEXT: cmovneq %r12, %r11
-; AVX512F-NEXT: cmovneq %rbp, %rsi
-; AVX512F-NEXT: cmovneq %rax, %r12
+; AVX512F-NEXT: subq $136, %rsp
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmoveq %rsi, %r12
-; AVX512F-NEXT: cmoveq %r11, %rsi
-; AVX512F-NEXT: cmoveq %r10, %r11
-; AVX512F-NEXT: cmoveq %r9, %r10
-; AVX512F-NEXT: cmoveq %r8, %r9
-; AVX512F-NEXT: cmoveq %r15, %r8
-; AVX512F-NEXT: cmoveq %r14, %r15
-; AVX512F-NEXT: cmoveq %rbx, %r14
-; AVX512F-NEXT: cmoveq %r13, %rbx
-; AVX512F-NEXT: shrdq %cl, %r14, %rbx
-; AVX512F-NEXT: shrdq %cl, %r15, %r14
-; AVX512F-NEXT: shrdq %cl, %r8, %r15
-; AVX512F-NEXT: shrdq %cl, %r9, %r8
-; AVX512F-NEXT: shrdq %cl, %r10, %r9
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: shrdq %cl, %rsi, %r11
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r12, %rsi
-; AVX512F-NEXT: movq %rsi, 56(%rdi)
-; AVX512F-NEXT: movq %r11, 48(%rdi)
-; AVX512F-NEXT: movq %r10, 40(%rdi)
-; AVX512F-NEXT: movq %r9, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %r15, 16(%rdi)
-; AVX512F-NEXT: movq %r14, 8(%rdi)
-; AVX512F-NEXT: movq %rbx, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vmovups (%rsi), %zmm0
+; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp)
+; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %ecx, %edx
+; AVX512F-NEXT: andl $63, %edx
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpandn %xmm3, %xmm0, %xmm4
+; AVX512F-NEXT: movl %ecx, %edx
+; AVX512F-NEXT: shrl $3, %edx
+; AVX512F-NEXT: andl $56, %edx
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm6 = zmm5[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm6, %zmm6, %zmm6
+; AVX512F-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm5, %zmm0
+; AVX512F-NEXT: vporq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT: movq %rcx, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: movl %edx, %esi
+; AVX512F-NEXT: andl $63, %esi
+; AVX512F-NEXT: vmovq %rsi, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512F-NEXT: shrl $3, %edx
+; AVX512F-NEXT: andl $-8, %edx
+; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: subq %rdx, %rsi
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm4, %zmm5, %zmm3
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: negq %rcx
+; AVX512F-NEXT: sbbl %edx, %edx
+; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq (%rsi), %r8
-; AVX512VL-NEXT: movq 8(%rsi), %r10
-; AVX512VL-NEXT: movq 16(%rsi), %rbx
-; AVX512VL-NEXT: movq 24(%rsi), %r15
-; AVX512VL-NEXT: movq 32(%rdx), %rax
-; AVX512VL-NEXT: movq 40(%rdx), %r9
-; AVX512VL-NEXT: movq 48(%rdx), %r11
-; AVX512VL-NEXT: movq 56(%rdx), %r14
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq 56(%rsi), %r13
-; AVX512VL-NEXT: cmoveq %r15, %r13
-; AVX512VL-NEXT: movq 48(%rsi), %rbp
-; AVX512VL-NEXT: cmoveq %rbx, %rbp
-; AVX512VL-NEXT: movq 40(%rsi), %r12
-; AVX512VL-NEXT: cmoveq %r10, %r12
-; AVX512VL-NEXT: movq 32(%rsi), %rsi
-; AVX512VL-NEXT: cmoveq %r8, %rsi
-; AVX512VL-NEXT: cmoveq %r14, %r15
-; AVX512VL-NEXT: cmoveq %r11, %rbx
-; AVX512VL-NEXT: cmoveq %r9, %r10
-; AVX512VL-NEXT: cmoveq 24(%rdx), %r14
-; AVX512VL-NEXT: cmoveq 8(%rdx), %r9
-; AVX512VL-NEXT: cmoveq %rax, %r8
-; AVX512VL-NEXT: cmoveq (%rdx), %rax
-; AVX512VL-NEXT: cmoveq 16(%rdx), %r11
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmovneq %r11, %rax
-; AVX512VL-NEXT: cmovneq %r14, %r9
-; AVX512VL-NEXT: cmovneq %r8, %r11
-; AVX512VL-NEXT: cmovneq %r10, %r14
-; AVX512VL-NEXT: cmovneq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %r15, %r10
-; AVX512VL-NEXT: cmovneq %rsi, %rbx
-; AVX512VL-NEXT: cmovneq %r12, %r15
-; AVX512VL-NEXT: cmovneq %rbp, %rsi
-; AVX512VL-NEXT: cmovneq %r13, %r12
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmoveq %rsi, %r12
-; AVX512VL-NEXT: cmoveq %r15, %rsi
-; AVX512VL-NEXT: cmoveq %rbx, %r15
-; AVX512VL-NEXT: cmoveq %r10, %rbx
-; AVX512VL-NEXT: cmoveq %r8, %r10
-; AVX512VL-NEXT: cmoveq %r14, %r8
-; AVX512VL-NEXT: cmoveq %r11, %r14
-; AVX512VL-NEXT: cmoveq %r9, %r11
-; AVX512VL-NEXT: cmoveq %rax, %r9
-; AVX512VL-NEXT: shrdq %cl, %r11, %r9
-; AVX512VL-NEXT: shrdq %cl, %r14, %r11
-; AVX512VL-NEXT: shrdq %cl, %r8, %r14
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r10
-; AVX512VL-NEXT: shrdq %cl, %r15, %rbx
+; AVX512VL-NEXT: subq $136, %rsp
+; AVX512VL-NEXT: vmovups (%rsi), %ymm1
+; AVX512VL-NEXT: vmovups 32(%rsi), %ymm2
+; AVX512VL-NEXT: vmovdqu64 (%rdx), %zmm0
+; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
+; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, (%rsp)
+; AVX512VL-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r15
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %r12, %rsi
-; AVX512VL-NEXT: movq %rsi, 56(%rdi)
-; AVX512VL-NEXT: movq %r15, 48(%rdi)
-; AVX512VL-NEXT: movq %rbx, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %r14, 16(%rdi)
-; AVX512VL-NEXT: movq %r11, 8(%rdi)
-; AVX512VL-NEXT: movq %r9, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: andl $63, %edx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: shrl $3, %edx
+; AVX512VL-NEXT: andl $56, %edx
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm4
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm6 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm6, %zmm6, %zmm6
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm6, %zmm3
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT: movq %rcx, %rdx
+; AVX512VL-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512VL-NEXT: movl %edx, %esi
+; AVX512VL-NEXT: andl $63, %esi
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: shrl $3, %edx
+; AVX512VL-NEXT: andl $-8, %edx
+; AVX512VL-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: subq %rdx, %rsi
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vporq %zmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: negq %rcx
+; AVX512VL-NEXT: sbbl %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: addq $136, %rsp
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq (%rsi), %r8
-; AVX512VBMI-NEXT: movq 8(%rsi), %r10
-; AVX512VBMI-NEXT: movq 16(%rsi), %rbx
-; AVX512VBMI-NEXT: movq 24(%rsi), %r15
-; AVX512VBMI-NEXT: movq 32(%rdx), %rax
-; AVX512VBMI-NEXT: movq 40(%rdx), %r9
-; AVX512VBMI-NEXT: movq 48(%rdx), %r11
-; AVX512VBMI-NEXT: movq 56(%rdx), %r14
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq 56(%rsi), %r13
-; AVX512VBMI-NEXT: cmoveq %r15, %r13
-; AVX512VBMI-NEXT: movq 48(%rsi), %rbp
-; AVX512VBMI-NEXT: cmoveq %rbx, %rbp
-; AVX512VBMI-NEXT: movq 40(%rsi), %r12
-; AVX512VBMI-NEXT: cmoveq %r10, %r12
-; AVX512VBMI-NEXT: movq 32(%rsi), %rsi
-; AVX512VBMI-NEXT: cmoveq %r8, %rsi
-; AVX512VBMI-NEXT: cmoveq %r14, %r15
-; AVX512VBMI-NEXT: cmoveq %r11, %rbx
-; AVX512VBMI-NEXT: cmoveq %r9, %r10
-; AVX512VBMI-NEXT: cmoveq 24(%rdx), %r14
-; AVX512VBMI-NEXT: cmoveq 8(%rdx), %r9
-; AVX512VBMI-NEXT: cmoveq %rax, %r8
-; AVX512VBMI-NEXT: cmoveq (%rdx), %rax
-; AVX512VBMI-NEXT: cmoveq 16(%rdx), %r11
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: cmovneq %r11, %rax
-; AVX512VBMI-NEXT: cmovneq %r14, %r9
-; AVX512VBMI-NEXT: cmovneq %r8, %r11
-; AVX512VBMI-NEXT: cmovneq %r10, %r14
-; AVX512VBMI-NEXT: cmovneq %rbx, %r8
-; AVX512VBMI-NEXT: cmovneq %r15, %r10
-; AVX512VBMI-NEXT: cmovneq %rsi, %rbx
-; AVX512VBMI-NEXT: cmovneq %r12, %r15
-; AVX512VBMI-NEXT: cmovneq %rbp, %rsi
-; AVX512VBMI-NEXT: cmovneq %r13, %r12
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmoveq %rsi, %r12
-; AVX512VBMI-NEXT: cmoveq %r15, %rsi
-; AVX512VBMI-NEXT: cmoveq %rbx, %r15
-; AVX512VBMI-NEXT: cmoveq %r10, %rbx
-; AVX512VBMI-NEXT: cmoveq %r8, %r10
-; AVX512VBMI-NEXT: cmoveq %r14, %r8
-; AVX512VBMI-NEXT: cmoveq %r11, %r14
-; AVX512VBMI-NEXT: cmoveq %r9, %r11
-; AVX512VBMI-NEXT: cmoveq %rax, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r8
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %rbx
+; AVX512VBMI-NEXT: subq $136, %rsp
+; AVX512VBMI-NEXT: vmovups (%rsi), %ymm1
+; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm2
+; AVX512VBMI-NEXT: vmovdqu64 (%rdx), %zmm0
+; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
+; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, (%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r15
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %r12, %rsi
-; AVX512VBMI-NEXT: movq %rsi, 56(%rdi)
-; AVX512VBMI-NEXT: movq %r15, 48(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r14, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r9, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
+; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VBMI-NEXT: movq %rcx, %rsi
+; AVX512VBMI-NEXT: xorq $511, %rsi # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm2
+; AVX512VBMI-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $-8, %esi
+; AVX512VBMI-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512VBMI-NEXT: subq %rsi, %rdi
+; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm3
+; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm5 = zmm4[7],zmm3[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm2, %zmm5, %zmm3
+; AVX512VBMI-NEXT: shrl $3, %edx
+; AVX512VBMI-NEXT: andl $56, %edx
+; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm4[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: xorl %edx, %edx
+; AVX512VBMI-NEXT: negq %rcx
+; AVX512VBMI-NEXT: sbbl %edx, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm3, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
+; AVX512VBMI-NEXT: addq $136, %rsp
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%a1 = load i512, ptr %p1
@@ -3599,297 +3414,121 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovq %xmm0, %r10
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT: vmovq %xmm2, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %r8
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT: vpextrq $1, %xmm0, %r9
-; AVX512F-NEXT: vmovq %xmm2, %r13
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rbx
-; AVX512F-NEXT: vmovq %xmm0, %r11
-; AVX512F-NEXT: vpextrq $1, %xmm1, %r12
-; AVX512F-NEXT: vmovq %xmm2, %r14
-; AVX512F-NEXT: vmovq %xmm1, %r15
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512F-NEXT: vmovq %xmm0, %rbp
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: cmovneq %r10, %rdx
-; AVX512F-NEXT: cmovneq %rbp, %r10
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT: cmoveq %rbp, %r15
-; AVX512F-NEXT: vmovq %xmm1, %rbp
-; AVX512F-NEXT: cmovneq %r11, %r13
-; AVX512F-NEXT: cmovneq %rbp, %r11
-; AVX512F-NEXT: cmoveq %rbp, %r14
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rbp
-; AVX512F-NEXT: cmovneq %r8, %rsi
-; AVX512F-NEXT: cmovneq %rbp, %r8
-; AVX512F-NEXT: cmoveq %rbp, %r12
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512F-NEXT: cmovneq %r9, %rax
-; AVX512F-NEXT: cmovneq %rbp, %r9
-; AVX512F-NEXT: cmoveq %rbp, %rbx
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmoveq %r14, %r15
-; AVX512F-NEXT: cmoveq %r10, %r14
-; AVX512F-NEXT: cmoveq %r11, %r10
-; AVX512F-NEXT: cmoveq %rdx, %r11
-; AVX512F-NEXT: cmoveq %r13, %rdx
-; AVX512F-NEXT: cmoveq %rbx, %r12
-; AVX512F-NEXT: cmoveq %r8, %rbx
-; AVX512F-NEXT: cmoveq %r9, %r8
-; AVX512F-NEXT: cmoveq %rsi, %r9
-; AVX512F-NEXT: cmoveq %rax, %rsi
+; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rsi, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpandn %xmm2, %xmm3, %xmm3
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmovneq %rdx, %rsi
-; AVX512F-NEXT: cmovneq %r9, %rdx
-; AVX512F-NEXT: cmovneq %r11, %r9
-; AVX512F-NEXT: cmovneq %r8, %r11
-; AVX512F-NEXT: cmovneq %r10, %r8
-; AVX512F-NEXT: cmovneq %rbx, %r10
-; AVX512F-NEXT: cmovneq %r14, %rbx
-; AVX512F-NEXT: cmoveq %r12, %r15
-; AVX512F-NEXT: cmovneq %r12, %r14
-; AVX512F-NEXT: movq %r14, %rdi
-; AVX512F-NEXT: shldq %cl, %r15, %rdi
-; AVX512F-NEXT: movq %rbx, %r15
-; AVX512F-NEXT: shldq %cl, %r14, %r15
-; AVX512F-NEXT: movq %r10, %r14
-; AVX512F-NEXT: shldq %cl, %rbx, %r14
-; AVX512F-NEXT: movq %r8, %rbx
-; AVX512F-NEXT: shldq %cl, %r10, %rbx
-; AVX512F-NEXT: movq %r11, %r10
-; AVX512F-NEXT: shldq %cl, %r8, %r10
-; AVX512F-NEXT: movq %r9, %r8
-; AVX512F-NEXT: shldq %cl, %r11, %r8
-; AVX512F-NEXT: movq %rdx, %r11
-; AVX512F-NEXT: shldq %cl, %r9, %r11
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq %rsi, 56(%rax)
-; AVX512F-NEXT: movq %r11, 48(%rax)
-; AVX512F-NEXT: movq %r8, 40(%rax)
-; AVX512F-NEXT: movq %r10, 32(%rax)
-; AVX512F-NEXT: movq %rbx, 24(%rax)
-; AVX512F-NEXT: movq %r14, 16(%rax)
-; AVX512F-NEXT: movq %r15, 8(%rax)
-; AVX512F-NEXT: movq %rdi, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: movq %rsi, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpand %xmm2, %xmm4, %xmm5
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm5, %zmm1, %zmm5
+; AVX512F-NEXT: vpandn %xmm2, %xmm4, %xmm2
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm6[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpsllq %xmm2, %zmm1, %zmm1
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rsi
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm5, %zmm1, %zmm1
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vmovq %xmm0, %r9
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %rdx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %r13
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %r8
-; AVX512VL-NEXT: vmovq %xmm0, %r10
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %r11
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %r15
-; AVX512VL-NEXT: vmovq %xmm0, %rbx
-; AVX512VL-NEXT: vmovq %xmm1, %r14
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %rbp
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: cmovneq %r9, %rdx
-; AVX512VL-NEXT: cmovneq %rbp, %r9
-; AVX512VL-NEXT: cmoveq %rbp, %r14
-; AVX512VL-NEXT: vmovq %xmm1, %rbp
-; AVX512VL-NEXT: cmovneq %r10, %r13
-; AVX512VL-NEXT: cmovneq %rbp, %r10
-; AVX512VL-NEXT: cmoveq %rbp, %rbx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rbp
-; AVX512VL-NEXT: cmovneq %rax, %rsi
-; AVX512VL-NEXT: cmovneq %rbp, %rax
-; AVX512VL-NEXT: cmoveq %rbp, %r15
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512VL-NEXT: cmovneq %r8, %r12
-; AVX512VL-NEXT: cmovneq %rbp, %r8
-; AVX512VL-NEXT: cmoveq %rbp, %r11
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmoveq %rbx, %r14
-; AVX512VL-NEXT: cmoveq %r9, %rbx
-; AVX512VL-NEXT: cmoveq %r10, %r9
-; AVX512VL-NEXT: cmoveq %rdx, %r10
-; AVX512VL-NEXT: cmoveq %r13, %rdx
-; AVX512VL-NEXT: cmoveq %r11, %r15
-; AVX512VL-NEXT: cmoveq %rax, %r11
-; AVX512VL-NEXT: cmoveq %r8, %rax
-; AVX512VL-NEXT: cmoveq %rsi, %r8
-; AVX512VL-NEXT: cmoveq %r12, %rsi
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmovneq %rdx, %rsi
-; AVX512VL-NEXT: cmovneq %r8, %rdx
-; AVX512VL-NEXT: cmovneq %r10, %r8
-; AVX512VL-NEXT: cmovneq %rax, %r10
-; AVX512VL-NEXT: cmovneq %r9, %rax
-; AVX512VL-NEXT: cmovneq %r11, %r9
-; AVX512VL-NEXT: cmovneq %rbx, %r11
-; AVX512VL-NEXT: cmoveq %r15, %r14
-; AVX512VL-NEXT: cmovneq %r15, %rbx
-; AVX512VL-NEXT: movq %rbx, %r15
-; AVX512VL-NEXT: shldq %cl, %r14, %r15
-; AVX512VL-NEXT: movq %r11, %r14
-; AVX512VL-NEXT: shldq %cl, %rbx, %r14
-; AVX512VL-NEXT: movq %r9, %rbx
-; AVX512VL-NEXT: shldq %cl, %r11, %rbx
-; AVX512VL-NEXT: movq %rax, %r11
-; AVX512VL-NEXT: shldq %cl, %r9, %r11
-; AVX512VL-NEXT: movq %r10, %r9
-; AVX512VL-NEXT: shldq %cl, %rax, %r9
-; AVX512VL-NEXT: movq %r8, %r12
-; AVX512VL-NEXT: shldq %cl, %r10, %r12
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdx, %rdi
-; AVX512VL-NEXT: shldq %cl, %r8, %rdi
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq %rsi, 56(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r12, 40(%rax)
-; AVX512VL-NEXT: movq %r9, 32(%rax)
-; AVX512VL-NEXT: movq %r11, 24(%rax)
-; AVX512VL-NEXT: movq %rbx, 16(%rax)
-; AVX512VL-NEXT: movq %r14, 8(%rax)
-; AVX512VL-NEXT: movq %r15, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: movq %rsi, %rcx
+; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm4
+; AVX512VL-NEXT: vpand %xmm3, %xmm4, %xmm5
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm5, %zmm1, %zmm5
+; AVX512VL-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm6[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rsi
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovq %xmm0, %r9
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %rdx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %rsi
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %r13
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r8
-; AVX512VBMI-NEXT: vmovq %xmm0, %r10
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r11
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r15
-; AVX512VBMI-NEXT: vmovq %xmm0, %rbx
-; AVX512VBMI-NEXT: vmovq %xmm1, %r14
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512VBMI-NEXT: vmovq %xmm0, %rbp
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: cmovneq %r9, %rdx
-; AVX512VBMI-NEXT: cmovneq %rbp, %r9
-; AVX512VBMI-NEXT: cmoveq %rbp, %r14
-; AVX512VBMI-NEXT: vmovq %xmm1, %rbp
-; AVX512VBMI-NEXT: cmovneq %r10, %r13
-; AVX512VBMI-NEXT: cmovneq %rbp, %r10
-; AVX512VBMI-NEXT: cmoveq %rbp, %rbx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rbp
-; AVX512VBMI-NEXT: cmovneq %rax, %rsi
-; AVX512VBMI-NEXT: cmovneq %rbp, %rax
-; AVX512VBMI-NEXT: cmoveq %rbp, %r15
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512VBMI-NEXT: cmovneq %r8, %r12
-; AVX512VBMI-NEXT: cmovneq %rbp, %r8
-; AVX512VBMI-NEXT: cmoveq %rbp, %r11
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: cmoveq %rbx, %r14
-; AVX512VBMI-NEXT: cmoveq %r9, %rbx
-; AVX512VBMI-NEXT: cmoveq %r10, %r9
-; AVX512VBMI-NEXT: cmoveq %rdx, %r10
-; AVX512VBMI-NEXT: cmoveq %r13, %rdx
-; AVX512VBMI-NEXT: cmoveq %r11, %r15
-; AVX512VBMI-NEXT: cmoveq %rax, %r11
-; AVX512VBMI-NEXT: cmoveq %r8, %rax
-; AVX512VBMI-NEXT: cmoveq %rsi, %r8
-; AVX512VBMI-NEXT: cmoveq %r12, %rsi
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmovneq %rdx, %rsi
-; AVX512VBMI-NEXT: cmovneq %r8, %rdx
-; AVX512VBMI-NEXT: cmovneq %r10, %r8
-; AVX512VBMI-NEXT: cmovneq %rax, %r10
-; AVX512VBMI-NEXT: cmovneq %r9, %rax
-; AVX512VBMI-NEXT: cmovneq %r11, %r9
-; AVX512VBMI-NEXT: cmovneq %rbx, %r11
-; AVX512VBMI-NEXT: cmoveq %r15, %r14
-; AVX512VBMI-NEXT: cmovneq %r15, %rbx
-; AVX512VBMI-NEXT: movq %rbx, %r15
-; AVX512VBMI-NEXT: shldq %cl, %r14, %r15
-; AVX512VBMI-NEXT: movq %r11, %r14
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: movq %r9, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rbx
-; AVX512VBMI-NEXT: movq %rax, %r11
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r11
-; AVX512VBMI-NEXT: movq %r10, %r9
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r9
-; AVX512VBMI-NEXT: movq %r8, %r12
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r12
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %rdx, %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rdi
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq %rsi, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT: movq %r12, 40(%rax)
-; AVX512VBMI-NEXT: movq %r9, 32(%rax)
-; AVX512VBMI-NEXT: movq %r11, 24(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 16(%rax)
-; AVX512VBMI-NEXT: movq %r14, 8(%rax)
-; AVX512VBMI-NEXT: movq %r15, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm5
+; AVX512VBMI-NEXT: vpshldvq %zmm5, %zmm4, %zmm2
+; AVX512VBMI-NEXT: movq %rsi, %rcx
+; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rsi
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
@@ -4169,276 +3808,121 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vmovq %xmm0, %rdx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512F-NEXT: vmovq %xmm2, %r8
-; AVX512F-NEXT: vpextrq $1, %xmm2, %r9
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vmovq %xmm2, %r13
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512F-NEXT: vmovq %xmm0, %r10
-; AVX512F-NEXT: vpextrq $1, %xmm0, %r11
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm0, %r14
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx
-; AVX512F-NEXT: vmovq %xmm1, %r15
-; AVX512F-NEXT: vextracti32x4 $2, %zmm1, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512F-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: cmoveq %r11, %rax
-; AVX512F-NEXT: cmoveq %rsi, %r9
-; AVX512F-NEXT: cmoveq %rbp, %r11
-; AVX512F-NEXT: cmoveq %r12, %rsi
-; AVX512F-NEXT: cmovneq %r12, %rbx
-; AVX512F-NEXT: cmovneq %rbp, %r14
-; AVX512F-NEXT: vmovq %xmm2, %r12
-; AVX512F-NEXT: vmovq %xmm1, %rbp
-; AVX512F-NEXT: cmoveq %r10, %r13
-; AVX512F-NEXT: cmoveq %rdx, %r8
-; AVX512F-NEXT: cmoveq %rbp, %r10
-; AVX512F-NEXT: cmoveq %r12, %rdx
-; AVX512F-NEXT: cmovneq %r12, %r15
-; AVX512F-NEXT: vmovq %xmm0, %r12
-; AVX512F-NEXT: cmovneq %rbp, %r12
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmovneq %r12, %r15
-; AVX512F-NEXT: cmovneq %rdx, %r12
-; AVX512F-NEXT: cmovneq %r10, %rdx
-; AVX512F-NEXT: cmovneq %r8, %r10
-; AVX512F-NEXT: cmovneq %r13, %r8
-; AVX512F-NEXT: cmovneq %r14, %rbx
-; AVX512F-NEXT: cmovneq %rsi, %r14
-; AVX512F-NEXT: cmovneq %r11, %rsi
-; AVX512F-NEXT: cmovneq %r9, %r11
-; AVX512F-NEXT: cmovneq %rax, %r9
+; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rsi, %xmm2
+; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm5 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpandn %xmm2, %xmm3, %xmm3
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: valignq {{.*#+}} zmm5 = zmm5[1,2,3,4,5,6,7],zmm6[0]
+; AVX512F-NEXT: vpaddq %zmm5, %zmm5, %zmm5
+; AVX512F-NEXT: vpsllq %xmm3, %zmm5, %zmm3
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmoveq %r8, %r9
-; AVX512F-NEXT: cmoveq %r11, %r8
-; AVX512F-NEXT: cmoveq %r10, %r11
-; AVX512F-NEXT: cmoveq %rsi, %r10
-; AVX512F-NEXT: cmoveq %rdx, %rsi
-; AVX512F-NEXT: cmoveq %r14, %rdx
-; AVX512F-NEXT: cmoveq %r12, %r14
-; AVX512F-NEXT: cmoveq %rbx, %r12
-; AVX512F-NEXT: cmoveq %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r12, %rbx
-; AVX512F-NEXT: shrdq %cl, %r14, %r12
-; AVX512F-NEXT: shrdq %cl, %rdx, %r14
-; AVX512F-NEXT: shrdq %cl, %rsi, %rdx
-; AVX512F-NEXT: shrdq %cl, %r10, %rsi
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: shrdq %cl, %r8, %r11
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r9, %r8
-; AVX512F-NEXT: movq %r8, 56(%rdi)
-; AVX512F-NEXT: movq %r11, 48(%rdi)
-; AVX512F-NEXT: movq %r10, 40(%rdi)
-; AVX512F-NEXT: movq %rsi, 32(%rdi)
-; AVX512F-NEXT: movq %rdx, 24(%rdi)
-; AVX512F-NEXT: movq %r14, 16(%rdi)
-; AVX512F-NEXT: movq %r12, 8(%rdi)
-; AVX512F-NEXT: movq %rbx, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vporq %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT: movq %rsi, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpand %xmm2, %xmm4, %xmm5
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm5, %zmm0, %zmm5
+; AVX512F-NEXT: vpandn %xmm2, %xmm4, %xmm2
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm6[7],zmm0[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm0, %zmm0
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rsi
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm0, %zmm5, %zmm0
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vmovq %xmm0, %rdx
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %r8
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %r9
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %r13
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512VL-NEXT: vmovq %xmm0, %r11
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VL-NEXT: vmovq %xmm0, %r10
-; AVX512VL-NEXT: vmovq %xmm1, %rax
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %r15
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rbp
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: cmoveq %rsi, %r9
-; AVX512VL-NEXT: cmoveq %rbp, %rsi
-; AVX512VL-NEXT: cmovneq %rbp, %r14
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512VL-NEXT: cmoveq %rbx, %r12
-; AVX512VL-NEXT: cmoveq %rbp, %rbx
-; AVX512VL-NEXT: cmovneq %rbp, %r15
-; AVX512VL-NEXT: vmovq %xmm0, %rbp
-; AVX512VL-NEXT: cmoveq %rdx, %r8
-; AVX512VL-NEXT: cmoveq %rbp, %rdx
-; AVX512VL-NEXT: cmovneq %rbp, %rax
-; AVX512VL-NEXT: vmovq %xmm1, %rbp
-; AVX512VL-NEXT: cmoveq %r11, %r13
-; AVX512VL-NEXT: cmoveq %rbp, %r11
-; AVX512VL-NEXT: cmovneq %rbp, %r10
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmovneq %r10, %rax
-; AVX512VL-NEXT: cmovneq %rdx, %r10
-; AVX512VL-NEXT: cmovneq %r11, %rdx
-; AVX512VL-NEXT: cmovneq %r8, %r11
-; AVX512VL-NEXT: cmovneq %r13, %r8
-; AVX512VL-NEXT: cmovneq %r15, %r14
-; AVX512VL-NEXT: cmovneq %rsi, %r15
-; AVX512VL-NEXT: cmovneq %rbx, %rsi
-; AVX512VL-NEXT: cmovneq %r9, %rbx
-; AVX512VL-NEXT: cmovneq %r12, %r9
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmoveq %r8, %r9
-; AVX512VL-NEXT: cmoveq %rbx, %r8
-; AVX512VL-NEXT: cmoveq %r11, %rbx
-; AVX512VL-NEXT: cmoveq %rsi, %r11
-; AVX512VL-NEXT: cmoveq %rdx, %rsi
-; AVX512VL-NEXT: cmoveq %r15, %rdx
-; AVX512VL-NEXT: cmoveq %r10, %r15
-; AVX512VL-NEXT: cmoveq %r14, %r10
-; AVX512VL-NEXT: cmoveq %rax, %r14
-; AVX512VL-NEXT: shrdq %cl, %r10, %r14
-; AVX512VL-NEXT: shrdq %cl, %r15, %r10
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r15
-; AVX512VL-NEXT: shrdq %cl, %rsi, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r11, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrdq %cl, %r8, %rbx
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %r9, %r8
-; AVX512VL-NEXT: movq %r8, 56(%rdi)
-; AVX512VL-NEXT: movq %rbx, 48(%rdi)
-; AVX512VL-NEXT: movq %r11, 40(%rdi)
-; AVX512VL-NEXT: movq %rsi, 32(%rdi)
-; AVX512VL-NEXT: movq %rdx, 24(%rdi)
-; AVX512VL-NEXT: movq %r15, 16(%rdi)
-; AVX512VL-NEXT: movq %r10, 8(%rdi)
-; AVX512VL-NEXT: movq %r14, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm5 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[1,2,3,4,5,6,7],zmm6[0]
+; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vporq %zmm4, %zmm2, %zmm2
+; AVX512VL-NEXT: movq %rsi, %rcx
+; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm4
+; AVX512VL-NEXT: vpand %xmm3, %xmm4, %xmm5
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm5, %zmm0, %zmm5
+; AVX512VL-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm6[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm0, %zmm0
+; AVX512VL-NEXT: vporq %zmm0, %zmm5, %zmm0
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rsi
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovq %xmm0, %rdx
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %r8
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r9
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %r13
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r12
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512VBMI-NEXT: vmovq %xmm0, %r11
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VBMI-NEXT: vmovq %xmm0, %r10
-; AVX512VBMI-NEXT: vmovq %xmm1, %rax
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r15
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %r14
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm1, %xmm0
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rbp
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm1, %xmm1
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: cmoveq %rsi, %r9
-; AVX512VBMI-NEXT: cmoveq %rbp, %rsi
-; AVX512VBMI-NEXT: cmovneq %rbp, %r14
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rbp
-; AVX512VBMI-NEXT: cmoveq %rbx, %r12
-; AVX512VBMI-NEXT: cmoveq %rbp, %rbx
-; AVX512VBMI-NEXT: cmovneq %rbp, %r15
-; AVX512VBMI-NEXT: vmovq %xmm0, %rbp
-; AVX512VBMI-NEXT: cmoveq %rdx, %r8
-; AVX512VBMI-NEXT: cmoveq %rbp, %rdx
-; AVX512VBMI-NEXT: cmovneq %rbp, %rax
-; AVX512VBMI-NEXT: vmovq %xmm1, %rbp
-; AVX512VBMI-NEXT: cmoveq %r11, %r13
-; AVX512VBMI-NEXT: cmoveq %rbp, %r11
-; AVX512VBMI-NEXT: cmovneq %rbp, %r10
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: cmovneq %r10, %rax
-; AVX512VBMI-NEXT: cmovneq %rdx, %r10
-; AVX512VBMI-NEXT: cmovneq %r11, %rdx
-; AVX512VBMI-NEXT: cmovneq %r8, %r11
-; AVX512VBMI-NEXT: cmovneq %r13, %r8
-; AVX512VBMI-NEXT: cmovneq %r15, %r14
-; AVX512VBMI-NEXT: cmovneq %rsi, %r15
-; AVX512VBMI-NEXT: cmovneq %rbx, %rsi
-; AVX512VBMI-NEXT: cmovneq %r9, %rbx
-; AVX512VBMI-NEXT: cmovneq %r12, %r9
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmoveq %r8, %r9
-; AVX512VBMI-NEXT: cmoveq %rbx, %r8
-; AVX512VBMI-NEXT: cmoveq %r11, %rbx
-; AVX512VBMI-NEXT: cmoveq %rsi, %r11
-; AVX512VBMI-NEXT: cmoveq %rdx, %rsi
-; AVX512VBMI-NEXT: cmoveq %r15, %rdx
-; AVX512VBMI-NEXT: cmoveq %r10, %r15
-; AVX512VBMI-NEXT: cmoveq %r14, %r10
-; AVX512VBMI-NEXT: cmoveq %rax, %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r14
-; AVX512VBMI-NEXT: shrdq %cl, %r15, %r10
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r15
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rbx
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r8
-; AVX512VBMI-NEXT: movq %r8, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 40(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 32(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r15, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r14, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm5
+; AVX512VBMI-NEXT: vpshrdvq %zmm5, %zmm4, %zmm2
+; AVX512VBMI-NEXT: movq %rsi, %rcx
+; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm3[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm0
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rsi
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
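The replacement check lines above all encode the same lowering shape: the shift amount is split into a whole-limb part (handled with a `-1 << (Amt / 64)` kmask feeding VPCOMPRESSQ or VPEXPANDQ) and a per-limb bit part (VPSRLQ/VPSLLQ, with VALIGNQ supplying the neighbouring limb for the carry), and a NEG/SBB-built kmask merges in the untouched input when the amount is zero. A hypothetical scalar model of the pattern (the name fshr512 and the limb layout are illustration only, not from the patch; it uses the corrected 512 - Amt inverse adopted by the follow-up commit below):

  // i512 held as eight little-endian uint64_t limbs. Not LLVM code, just a
  // sketch of what the emitted AVX512 sequence computes.
  #include <cstdint>

  void fshr512(const uint64_t X[8], const uint64_t Y[8], uint64_t R[8],
               unsigned Amt) {
    Amt &= 511;                              // andl $511, %esi
    unsigned Q = Amt >> 6, B = Amt & 63;     // whole-limb part / bit part
    unsigned Inv = 512 - Amt;                // InvAmt in the lowering
    unsigned IQ = Inv >> 6, IB = Inv & 63;
    for (unsigned I = 0; I < 8; ++I) {
      // Y >> Amt: limbs dropped as with vpcompressq, carry as with valignq.
      uint64_t C = I + Q < 8 ? Y[I + Q] : 0;
      uint64_t N = I + Q + 1 < 8 ? Y[I + Q + 1] : 0;
      uint64_t Lo = B ? (C >> B) | (N << (64 - B)) : C;
      // X << (512 - Amt): zero limbs inserted as with vpexpandq.
      uint64_t D = I >= IQ ? X[I - IQ] : 0;
      uint64_t P = I >= IQ + 1 ? X[I - IQ - 1] : 0;
      uint64_t Hi = IB ? (D << IB) | (P >> (64 - IB)) : D;
      R[I] = Amt ? Lo | Hi : Y[I];           // neg/sbb kmask select at zero
    }
  }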
@@ -4703,258 +4187,123 @@ define i512 @fshl_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_rot_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vmovq %xmm2, %rsi
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512F-NEXT: vpextrq $1, %xmm2, %r11
-; AVX512F-NEXT: vpextrq $1, %xmm3, %rbx
-; AVX512F-NEXT: vmovq %xmm3, %r14
-; AVX512F-NEXT: vmovq %xmm0, %r10
-; AVX512F-NEXT: vmovq %xmm1, %r12
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq %r12, %r15
-; AVX512F-NEXT: cmovneq %r10, %r15
-; AVX512F-NEXT: movq %rbx, %r13
-; AVX512F-NEXT: cmovneq %r11, %r13
-; AVX512F-NEXT: movq %r14, %r8
-; AVX512F-NEXT: cmovneq %rsi, %r8
-; AVX512F-NEXT: movq %rax, %r9
-; AVX512F-NEXT: cmovneq %rdx, %r9
-; AVX512F-NEXT: cmovneq %r12, %r10
-; AVX512F-NEXT: cmovneq %r14, %rsi
-; AVX512F-NEXT: cmovneq %rax, %rdx
-; AVX512F-NEXT: cmovneq %rbx, %r11
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq %r9, %r14
-; AVX512F-NEXT: cmovneq %r11, %r14
-; AVX512F-NEXT: cmovneq %rdx, %r11
-; AVX512F-NEXT: movq %r8, %rbx
-; AVX512F-NEXT: cmovneq %r10, %rbx
-; AVX512F-NEXT: cmovneq %rsi, %r10
-; AVX512F-NEXT: cmoveq %r13, %r9
-; AVX512F-NEXT: cmoveq %r15, %r8
-; AVX512F-NEXT: cmovneq %r13, %rdx
+; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmovneq %r15, %rsi
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %rsi, %rdi
-; AVX512F-NEXT: cmovneq %rdx, %rdi
-; AVX512F-NEXT: cmovneq %r8, %rdx
-; AVX512F-NEXT: cmovneq %r9, %r8
-; AVX512F-NEXT: cmovneq %rbx, %r9
-; AVX512F-NEXT: cmovneq %r14, %rbx
-; AVX512F-NEXT: cmovneq %r10, %r14
-; AVX512F-NEXT: cmoveq %r11, %rsi
-; AVX512F-NEXT: cmovneq %r11, %r10
-; AVX512F-NEXT: movq %r10, %r11
-; AVX512F-NEXT: shldq %cl, %rsi, %r11
-; AVX512F-NEXT: movq %r14, %r15
-; AVX512F-NEXT: shldq %cl, %r10, %r15
-; AVX512F-NEXT: movq %rbx, %r10
-; AVX512F-NEXT: shldq %cl, %r14, %r10
-; AVX512F-NEXT: movq %r9, %r14
-; AVX512F-NEXT: shldq %cl, %rbx, %r14
-; AVX512F-NEXT: movq %r8, %rbx
-; AVX512F-NEXT: shldq %cl, %r9, %rbx
-; AVX512F-NEXT: movq %rdx, %r9
-; AVX512F-NEXT: shldq %cl, %r8, %r9
-; AVX512F-NEXT: movq %rdi, %r8
-; AVX512F-NEXT: shldq %cl, %rdx, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rsi
-; AVX512F-NEXT: movq %rsi, 56(%rax)
-; AVX512F-NEXT: movq %r8, 48(%rax)
-; AVX512F-NEXT: movq %r9, 40(%rax)
-; AVX512F-NEXT: movq %rbx, 32(%rax)
-; AVX512F-NEXT: movq %r14, 24(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %r15, 8(%rax)
-; AVX512F-NEXT: movq %r11, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: movq %rsi, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandn %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpsllq %xmm1, %zmm3, %zmm1
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rsi
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_rot_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %rsi
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %r10
-; AVX512VL-NEXT: vmovq %xmm3, %r14
-; AVX512VL-NEXT: vpextrq $1, %xmm3, %r15
-; AVX512VL-NEXT: vmovq %xmm0, %r9
-; AVX512VL-NEXT: vmovq %xmm1, %r12
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq %r12, %r11
-; AVX512VL-NEXT: cmovneq %r9, %r11
-; AVX512VL-NEXT: movq %r15, %r13
-; AVX512VL-NEXT: cmovneq %r10, %r13
-; AVX512VL-NEXT: movq %r14, %rax
-; AVX512VL-NEXT: cmovneq %rsi, %rax
-; AVX512VL-NEXT: movq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %rdx, %r8
-; AVX512VL-NEXT: cmovneq %r12, %r9
-; AVX512VL-NEXT: cmovneq %r14, %rsi
-; AVX512VL-NEXT: cmovneq %rbx, %rdx
-; AVX512VL-NEXT: cmovneq %r15, %r10
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq %r8, %r14
-; AVX512VL-NEXT: cmovneq %r10, %r14
-; AVX512VL-NEXT: cmovneq %rdx, %r10
-; AVX512VL-NEXT: movq %rax, %rbx
-; AVX512VL-NEXT: cmovneq %r9, %rbx
-; AVX512VL-NEXT: cmovneq %rsi, %r9
-; AVX512VL-NEXT: cmoveq %r13, %r8
-; AVX512VL-NEXT: cmoveq %r11, %rax
-; AVX512VL-NEXT: cmovneq %r13, %rdx
-; AVX512VL-NEXT: cmovneq %r11, %rsi
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %rsi, %r11
-; AVX512VL-NEXT: cmovneq %rdx, %r11
-; AVX512VL-NEXT: cmovneq %rax, %rdx
-; AVX512VL-NEXT: cmovneq %r8, %rax
-; AVX512VL-NEXT: cmovneq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %r14, %rbx
-; AVX512VL-NEXT: cmovneq %r9, %r14
-; AVX512VL-NEXT: cmoveq %r10, %rsi
-; AVX512VL-NEXT: cmovneq %r10, %r9
-; AVX512VL-NEXT: movq %r9, %r10
-; AVX512VL-NEXT: shldq %cl, %rsi, %r10
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %r9, %r15
-; AVX512VL-NEXT: movq %rbx, %r9
-; AVX512VL-NEXT: shldq %cl, %r14, %r9
-; AVX512VL-NEXT: movq %r8, %r14
-; AVX512VL-NEXT: shldq %cl, %rbx, %r14
-; AVX512VL-NEXT: movq %rax, %rbx
-; AVX512VL-NEXT: shldq %cl, %r8, %rbx
-; AVX512VL-NEXT: movq %rdx, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %r11, %rdi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rdi
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rsi
-; AVX512VL-NEXT: movq %rsi, 56(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r8, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r14, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 16(%rax)
-; AVX512VL-NEXT: movq %r15, 8(%rax)
-; AVX512VL-NEXT: movq %r10, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: movq %rsi, %rcx
+; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm4, %zmm2, %zmm2
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rsi
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_rot_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %rbx
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %rsi
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %r10
-; AVX512VBMI-NEXT: vmovq %xmm3, %r14
-; AVX512VBMI-NEXT: vpextrq $1, %xmm3, %r15
-; AVX512VBMI-NEXT: vmovq %xmm0, %r9
-; AVX512VBMI-NEXT: vmovq %xmm1, %r12
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq %r12, %r11
-; AVX512VBMI-NEXT: cmovneq %r9, %r11
-; AVX512VBMI-NEXT: movq %r15, %r13
-; AVX512VBMI-NEXT: cmovneq %r10, %r13
-; AVX512VBMI-NEXT: movq %r14, %rax
-; AVX512VBMI-NEXT: cmovneq %rsi, %rax
-; AVX512VBMI-NEXT: movq %rbx, %r8
-; AVX512VBMI-NEXT: cmovneq %rdx, %r8
-; AVX512VBMI-NEXT: cmovneq %r12, %r9
-; AVX512VBMI-NEXT: cmovneq %r14, %rsi
-; AVX512VBMI-NEXT: cmovneq %rbx, %rdx
-; AVX512VBMI-NEXT: cmovneq %r15, %r10
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq %r8, %r14
-; AVX512VBMI-NEXT: cmovneq %r10, %r14
-; AVX512VBMI-NEXT: cmovneq %rdx, %r10
-; AVX512VBMI-NEXT: movq %rax, %rbx
-; AVX512VBMI-NEXT: cmovneq %r9, %rbx
-; AVX512VBMI-NEXT: cmovneq %rsi, %r9
-; AVX512VBMI-NEXT: cmoveq %r13, %r8
-; AVX512VBMI-NEXT: cmoveq %r11, %rax
-; AVX512VBMI-NEXT: cmovneq %r13, %rdx
-; AVX512VBMI-NEXT: cmovneq %r11, %rsi
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %rsi, %r11
-; AVX512VBMI-NEXT: cmovneq %rdx, %r11
-; AVX512VBMI-NEXT: cmovneq %rax, %rdx
-; AVX512VBMI-NEXT: cmovneq %r8, %rax
-; AVX512VBMI-NEXT: cmovneq %rbx, %r8
-; AVX512VBMI-NEXT: cmovneq %r14, %rbx
-; AVX512VBMI-NEXT: cmovneq %r9, %r14
-; AVX512VBMI-NEXT: cmoveq %r10, %rsi
-; AVX512VBMI-NEXT: cmovneq %r10, %r9
-; AVX512VBMI-NEXT: movq %r9, %r10
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %r10
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rbx, %r9
-; AVX512VBMI-NEXT: shldq %cl, %r14, %r9
-; AVX512VBMI-NEXT: movq %r8, %r14
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: movq %rax, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rbx
-; AVX512VBMI-NEXT: movq %rdx, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %r11, %rdi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rdi
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rsi
-; AVX512VBMI-NEXT: movq %rsi, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT: movq %r8, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r14, 24(%rax)
-; AVX512VBMI-NEXT: movq %r9, 16(%rax)
-; AVX512VBMI-NEXT: movq %r15, 8(%rax)
-; AVX512VBMI-NEXT: movq %r10, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: movq %rsi, %rcx
+; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm4[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm2, %zmm4
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rsi
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm4, %zmm1, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
@@ -5200,240 +4549,123 @@ define i512 @fshr_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_rot_i512_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512F-NEXT: vmovq %xmm2, %r8
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512F-NEXT: vpextrq $1, %xmm2, %r14
-; AVX512F-NEXT: vpextrq $1, %xmm3, %r11
-; AVX512F-NEXT: vmovq %xmm3, %rbx
-; AVX512F-NEXT: vmovq %xmm0, %rsi
-; AVX512F-NEXT: vmovq %xmm1, %r12
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq %r12, %r15
-; AVX512F-NEXT: cmoveq %rsi, %r15
-; AVX512F-NEXT: movq %r11, %r13
-; AVX512F-NEXT: cmoveq %r14, %r13
-; AVX512F-NEXT: movq %rbx, %r9
-; AVX512F-NEXT: cmoveq %r8, %r9
-; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: cmoveq %rdx, %r10
-; AVX512F-NEXT: cmoveq %r12, %rsi
-; AVX512F-NEXT: cmoveq %rbx, %r8
-; AVX512F-NEXT: cmoveq %rax, %rdx
-; AVX512F-NEXT: cmoveq %r11, %r14
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq %r10, %r11
-; AVX512F-NEXT: cmoveq %r14, %r11
-; AVX512F-NEXT: cmoveq %rdx, %r14
-; AVX512F-NEXT: movq %r9, %rbx
-; AVX512F-NEXT: cmoveq %rsi, %rbx
-; AVX512F-NEXT: cmoveq %r8, %rsi
-; AVX512F-NEXT: cmovneq %r13, %r10
-; AVX512F-NEXT: cmovneq %r15, %r9
-; AVX512F-NEXT: cmoveq %r13, %rdx
+; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rsi, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX512F-NEXT: movl %esi, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512F-NEXT: vpsllq %xmm2, %zmm4, %zmm2
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmoveq %r15, %r8
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %r8, %rdi
-; AVX512F-NEXT: cmoveq %rdx, %rdi
-; AVX512F-NEXT: cmoveq %r9, %rdx
-; AVX512F-NEXT: cmoveq %r10, %r9
-; AVX512F-NEXT: cmoveq %rbx, %r10
-; AVX512F-NEXT: cmoveq %r11, %rbx
-; AVX512F-NEXT: cmoveq %rsi, %r11
-; AVX512F-NEXT: cmoveq %r14, %rsi
-; AVX512F-NEXT: cmovneq %r14, %r8
-; AVX512F-NEXT: movq %r8, %r14
-; AVX512F-NEXT: shrdq %cl, %rsi, %r14
-; AVX512F-NEXT: shrdq %cl, %r11, %rsi
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: shrdq %cl, %r10, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r10
-; AVX512F-NEXT: shrdq %cl, %rdx, %r9
-; AVX512F-NEXT: shrdq %cl, %rdi, %rdx
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r8, %rdi
-; AVX512F-NEXT: movq %rdi, 56(%rax)
-; AVX512F-NEXT: movq %rdx, 48(%rax)
-; AVX512F-NEXT: movq %r9, 40(%rax)
-; AVX512F-NEXT: movq %r10, 32(%rax)
-; AVX512F-NEXT: movq %rbx, 24(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %rsi, 8(%rax)
-; AVX512F-NEXT: movq %r14, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: movq %rsi, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandn %xmm1, %xmm3, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm3, %zmm1
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rsi
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm1, %zmm4, %zmm1
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_rot_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %r11
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VL-NEXT: vmovq %xmm2, %r8
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512VL-NEXT: vmovq %xmm3, %rbx
-; AVX512VL-NEXT: vpextrq $1, %xmm3, %r15
-; AVX512VL-NEXT: vmovq %xmm0, %rsi
-; AVX512VL-NEXT: vmovq %xmm1, %r12
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq %r12, %r14
-; AVX512VL-NEXT: cmoveq %rsi, %r14
-; AVX512VL-NEXT: movq %r15, %r13
-; AVX512VL-NEXT: cmoveq %rax, %r13
-; AVX512VL-NEXT: movq %rbx, %r9
-; AVX512VL-NEXT: cmoveq %r8, %r9
-; AVX512VL-NEXT: movq %r11, %r10
-; AVX512VL-NEXT: cmoveq %rdx, %r10
-; AVX512VL-NEXT: cmoveq %r12, %rsi
-; AVX512VL-NEXT: cmoveq %rbx, %r8
-; AVX512VL-NEXT: cmoveq %r11, %rdx
-; AVX512VL-NEXT: cmoveq %r15, %rax
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq %r10, %r11
-; AVX512VL-NEXT: cmoveq %rax, %r11
-; AVX512VL-NEXT: cmoveq %rdx, %rax
-; AVX512VL-NEXT: movq %r9, %rbx
-; AVX512VL-NEXT: cmoveq %rsi, %rbx
-; AVX512VL-NEXT: cmoveq %r8, %rsi
-; AVX512VL-NEXT: cmovneq %r13, %r10
-; AVX512VL-NEXT: cmovneq %r14, %r9
-; AVX512VL-NEXT: cmoveq %r13, %rdx
-; AVX512VL-NEXT: cmoveq %r14, %r8
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %r8, %r14
-; AVX512VL-NEXT: cmoveq %rdx, %r14
-; AVX512VL-NEXT: cmoveq %r9, %rdx
-; AVX512VL-NEXT: cmoveq %r10, %r9
-; AVX512VL-NEXT: cmoveq %rbx, %r10
-; AVX512VL-NEXT: cmoveq %r11, %rbx
-; AVX512VL-NEXT: cmoveq %rsi, %r11
-; AVX512VL-NEXT: cmoveq %rax, %rsi
-; AVX512VL-NEXT: cmovneq %rax, %r8
-; AVX512VL-NEXT: movq %r8, %r15
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r15
-; AVX512VL-NEXT: shrdq %cl, %r11, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT: shrdq %cl, %r9, %r10
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r9
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrdq %cl, %r14, %rdx
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %r8, %r14
-; AVX512VL-NEXT: movq %r14, 56(%rdi)
-; AVX512VL-NEXT: movq %rdx, 48(%rdi)
-; AVX512VL-NEXT: movq %r9, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %rbx, 24(%rdi)
-; AVX512VL-NEXT: movq %r11, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: movl %esi, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: movq %rsi, %rcx
+; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rsi
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_rot_i512_vector:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm0, %r11
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vmovq %xmm2, %r8
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
-; AVX512VBMI-NEXT: vpextrq $1, %xmm2, %rax
-; AVX512VBMI-NEXT: vmovq %xmm3, %rbx
-; AVX512VBMI-NEXT: vpextrq $1, %xmm3, %r15
-; AVX512VBMI-NEXT: vmovq %xmm0, %rsi
-; AVX512VBMI-NEXT: vmovq %xmm1, %r12
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq %r12, %r14
-; AVX512VBMI-NEXT: cmoveq %rsi, %r14
-; AVX512VBMI-NEXT: movq %r15, %r13
-; AVX512VBMI-NEXT: cmoveq %rax, %r13
-; AVX512VBMI-NEXT: movq %rbx, %r9
-; AVX512VBMI-NEXT: cmoveq %r8, %r9
-; AVX512VBMI-NEXT: movq %r11, %r10
-; AVX512VBMI-NEXT: cmoveq %rdx, %r10
-; AVX512VBMI-NEXT: cmoveq %r12, %rsi
-; AVX512VBMI-NEXT: cmoveq %rbx, %r8
-; AVX512VBMI-NEXT: cmoveq %r11, %rdx
-; AVX512VBMI-NEXT: cmoveq %r15, %rax
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq %r10, %r11
-; AVX512VBMI-NEXT: cmoveq %rax, %r11
-; AVX512VBMI-NEXT: cmoveq %rdx, %rax
-; AVX512VBMI-NEXT: movq %r9, %rbx
-; AVX512VBMI-NEXT: cmoveq %rsi, %rbx
-; AVX512VBMI-NEXT: cmoveq %r8, %rsi
-; AVX512VBMI-NEXT: cmovneq %r13, %r10
-; AVX512VBMI-NEXT: cmovneq %r14, %r9
-; AVX512VBMI-NEXT: cmoveq %r13, %rdx
-; AVX512VBMI-NEXT: cmoveq %r14, %r8
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %r8, %r14
-; AVX512VBMI-NEXT: cmoveq %rdx, %r14
-; AVX512VBMI-NEXT: cmoveq %r9, %rdx
-; AVX512VBMI-NEXT: cmoveq %r10, %r9
-; AVX512VBMI-NEXT: cmoveq %rbx, %r10
-; AVX512VBMI-NEXT: cmoveq %r11, %rbx
-; AVX512VBMI-NEXT: cmoveq %rsi, %r11
-; AVX512VBMI-NEXT: cmoveq %rax, %rsi
-; AVX512VBMI-NEXT: cmovneq %rax, %r8
-; AVX512VBMI-NEXT: movq %r8, %r15
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r10
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r9
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %rdx
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %r14
-; AVX512VBMI-NEXT: movq %r14, 56(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 48(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
+; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %edx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
+; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: movq %rsi, %rcx
+; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm2, %zmm4
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rsi
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm1, %zmm4, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = bitcast <8 x i64> %v0 to i512
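The rotate variants above use the same limb/bit split but with a single source, so the two halves that the final vporq combines reconstruct a limb rotation, and an amount of zero naturally yields the identity. A hypothetical scalar model (the name rotl512 and the limb layout are illustration only, not from the patch):

  // Rotate-left of an i512 held as eight little-endian uint64_t limbs.
  #include <cstdint>

  void rotl512(const uint64_t V[8], uint64_t R[8], unsigned Amt) {
    Amt &= 511;                         // andl $511, %esi
    unsigned Limb = Amt >> 6, Bit = Amt & 63;
    uint64_t L[8];
    for (unsigned I = 0; I < 8; ++I)
      L[(I + Limb) & 7] = V[I];         // vporq of the expand/compress halves
    for (unsigned I = 0; I < 8; ++I) {
      uint64_t Prev = L[(I + 7) & 7];   // neighbouring limb, as with valignq
      R[I] = Bit ? (L[I] << Bit) | (Prev >> (64 - Bit)) : L[I];
    }
  }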
>From b61e64972740ba1bf5fc140d7b7a47345e37558c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 10 Mar 2026 13:15:38 +0000
Subject: [PATCH 2/4] Fix InvAmt flip
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 4 +-
llvm/test/CodeGen/X86/funnel-shift-i512.ll | 622 +++++++++++----------
2 files changed, 315 insertions(+), 311 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 62871e0743c0c..f11c3c97be0fa 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34566,8 +34566,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue AmtZ = DAG.getSetCC(dl, MVT::i1, Amt, DAG.getConstant(0, dl, AmtVT),
ISD::SETNE);
SDValue Sel = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, AmtZ);
- SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
- DAG.getConstant(BW - 1, dl, AmtVT), Amt);
+ SDValue InvAmt =
+ DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BW, dl, AmtVT), Amt);
SDValue ShX =
DAG.getNode(ISD::SHL, dl, VT, Op0, Opc == ISD::FSHL ? Amt : InvAmt);
SDValue ShY =
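This flip is the entire fix: once the amount has been reduced modulo BW, the complementary shift must be BW - Amt, not BW - 1 - Amt, otherwise the two ORed halves land one bit apart. The SETNE/SIGN_EXTEND select in the surrounding context is what keeps BW - 0, an out-of-range shift, from ever being used. A minimal scalar analogue of the intended expansion, with 64 standing in for BW (fshl64 is an illustrative name, not from the patch):

  #include <cstdint>

  uint64_t fshl64(uint64_t X, uint64_t Y, unsigned Z) {
    unsigned Amt = Z & 63;             // Amt = Z & (BW - 1)
    if (Amt == 0)                      // guard: BW - 0 would shift by BW
      return X;                        // fshl(x, y, 0) == x
    return (X << Amt) | (Y >> (64 - Amt));
  }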
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i512.ll b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
index 35c67a696dfef..c6105d2170cf1 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i512.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
@@ -1841,49 +1841,50 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512F-NEXT: vmovups (%rdx), %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovdqu64 %zmm2, {{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %zmm1, (%rsp)
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: andl $63, %eax
-; AVX512F-NEXT: vmovq %rax, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm4
-; AVX512F-NEXT: movl %ecx, %eax
-; AVX512F-NEXT: shrl $3, %eax
-; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: leaq -{{[0-9]+}}(%rsp), %rdx
-; AVX512F-NEXT: subq %rax, %rdx
-; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm6 = zmm2[7],zmm5[0,1,2,3,4,5,6]
-; AVX512F-NEXT: vpsrlq $1, %zmm6, %zmm6
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpsllq %xmm1, %zmm5, %zmm1
+; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movl %ecx, %edx
+; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT: movl $512, %esi # imm = 0x200
+; AVX512F-NEXT: subq %rdx, %rsi
+; AVX512F-NEXT: movl %esi, %edi
+; AVX512F-NEXT: andl $63, %edi
+; AVX512F-NEXT: vmovq %rdi, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: vmovdqu64 (%rsp,%rsi), %zmm3
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm3, %zmm4
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [63,63]
+; AVX512F-NEXT: vpandn %xmm5, %xmm1, %xmm1
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpsllq %xmm1, %zmm3, %zmm1
; AVX512F-NEXT: vporq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512F-NEXT: movq %rcx, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: movl %edx, %esi
+; AVX512F-NEXT: movl %ecx, %esi
; AVX512F-NEXT: andl $63, %esi
-; AVX512F-NEXT: vmovq %rsi, %xmm4
-; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
-; AVX512F-NEXT: vpandn %xmm3, %xmm4, %xmm3
-; AVX512F-NEXT: shrl $6, %edx
-; AVX512F-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllq %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm5, %zmm3
-; AVX512F-NEXT: xorl %edx, %edx
-; AVX512F-NEXT: negq %rcx
-; AVX512F-NEXT: sbbl %edx, %edx
-; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: kmovw %edx, %k1
-; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpandn %xmm5, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $3, %ecx
+; AVX512F-NEXT: andl $56, %ecx
+; AVX512F-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512F-NEXT: subq %rcx, %rsi
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm3, %zmm5, %zmm3
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rdx
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
@@ -1893,53 +1894,54 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
; AVX512VL-NEXT: vmovups (%rdx), %ymm2
-; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm2, (%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: andl $63, %edx
-; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
-; AVX512VL-NEXT: subq %rdx, %rsi
-; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm3
-; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm4
+; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT: movl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: subq %rdx, %rax
+; AVX512VL-NEXT: movl %eax, %esi
+; AVX512VL-NEXT: andl $63, %esi
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: vmovdqu64 (%rsp,%rax), %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm2, %zmm3
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm4 = [63,63]
+; AVX512VL-NEXT: vpandn %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: valignq {{.*#+}} zmm6 = zmm5[7],zmm3[0,1,2,3,4,5,6]
-; AVX512VL-NEXT: vpsrlq $1, %zmm6, %zmm6
-; AVX512VL-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
-; AVX512VL-NEXT: vpsllq %xmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: vporq %zmm4, %zmm1, %zmm1
-; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512VL-NEXT: movq %rcx, %rdx
-; AVX512VL-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512VL-NEXT: movl %edx, %esi
+; AVX512VL-NEXT: valignq {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: movl %ecx, %esi
; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm3
-; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: shrl $6, %edx
-; AVX512VL-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
-; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
-; AVX512VL-NEXT: vpsllq %xmm2, %zmm5, %zmm2
-; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
+; AVX512VL-NEXT: vpandn %xmm4, %xmm2, %xmm3
+; AVX512VL-NEXT: shrl $3, %ecx
+; AVX512VL-NEXT: andl $56, %ecx
+; AVX512VL-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: subq %rcx, %rsi
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm4, %zmm2
; AVX512VL-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: xorl %edx, %edx
-; AVX512VL-NEXT: negq %rcx
-; AVX512VL-NEXT: sbbl %edx, %edx
-; AVX512VL-NEXT: kmovd %edx, %k1
-; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rdx
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: addq $136, %rsp
; AVX512VL-NEXT: vzeroupper
@@ -1950,43 +1952,45 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512VBMI-NEXT: subq $136, %rsp
; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm2
-; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups (%rdx), %ymm2
+; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
+; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm2, (%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rsp)
+; AVX512VBMI-NEXT: movl %ecx, %esi
; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
-; AVX512VBMI-NEXT: subq %rdx, %rsi
-; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm1
-; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm1[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
-; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm1
-; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512VBMI-NEXT: movq %rcx, %rdx
-; AVX512VBMI-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm3
-; AVX512VBMI-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
-; AVX512VBMI-NEXT: shrl $6, %edx
-; AVX512VBMI-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm4[1,2,3,4,5,6,7],zmm2[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm2, %zmm4
-; AVX512VBMI-NEXT: xorl %edx, %edx
-; AVX512VBMI-NEXT: negq %rcx
-; AVX512VBMI-NEXT: sbbl %edx, %edx
-; AVX512VBMI-NEXT: kmovd %edx, %k1
-; AVX512VBMI-NEXT: vporq %zmm4, %zmm1, %zmm0 {%k1}
-; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VBMI-NEXT: movl $512, %edi # imm = 0x200
+; AVX512VBMI-NEXT: subq %rdx, %rdi
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm1
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
+; AVX512VBMI-NEXT: shrl $3, %edi
+; AVX512VBMI-NEXT: andl $56, %edi
+; AVX512VBMI-NEXT: vmovdqu64 (%rsp,%rdi), %zmm2
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
+; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm5 = zmm2[1,2,3,4,5,6,7],zmm4[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm5, %zmm2
+; AVX512VBMI-NEXT: shrl $3, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
+; AVX512VBMI-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
+; AVX512VBMI-NEXT: subq %rsi, %rcx
+; AVX512VBMI-NEXT: vmovdqu64 (%rcx), %zmm1
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm4[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm4, %zmm1
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rdx
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VBMI-NEXT: addq $136, %rsp
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
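In the *_i512_load variants above, the whole-limb step goes through memory instead of compress/expand: the value is spilled next to a block of zeros and one unaligned 64-byte load at a byte offset of (Amt >> 3) & 56 performs the limb shift, while the vpsrlq $1 plus vpandn-with-63 pair is the branch-free form of the 64 - Bit carry shift. A hypothetical scalar model of the right-shift direction (srl512 and the buffer layout are illustration only, not from the patch):

  #include <cstdint>
  #include <cstring>

  void srl512(const uint64_t Src[8], uint64_t Dst[8], unsigned Amt) {
    Amt &= 511;
    uint64_t Buf[16] = {};                 // zeros spilled above the value
    std::memcpy(Buf, Src, 64);
    const uint64_t *P = Buf + (Amt >> 6);  // (Amt >> 3) & 56 bytes in the asm
    unsigned Bit = Amt & 63;
    for (unsigned I = 0; I < 8; ++I) {
      uint64_t Next = P[I + 1];            // reads zeros once past the value
      Dst[I] = Bit ? (P[I] >> Bit) | (Next << (64 - Bit)) : P[I];
    }
  }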
@@ -2160,53 +2164,53 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512F-LABEL: fshr_i512_load:
; AVX512F: # %bb.0:
; AVX512F-NEXT: subq $136, %rsp
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vmovups (%rsi), %zmm0
; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm1
; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp)
; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: andl $63, %edx
-; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT: movl $512, %esi # imm = 0x200
+; AVX512F-NEXT: subq %rdx, %rsi
+; AVX512F-NEXT: movl %esi, %edi
+; AVX512F-NEXT: andl $63, %edi
+; AVX512F-NEXT: vmovq %rdi, %xmm0
; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
-; AVX512F-NEXT: vpandn %xmm3, %xmm0, %xmm4
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $56, %edx
-; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm6 = zmm5[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT: vpaddq %zmm6, %zmm6, %zmm6
-; AVX512F-NEXT: vpsllq %xmm4, %zmm6, %zmm4
-; AVX512F-NEXT: vpsrlq %xmm0, %zmm5, %zmm0
+; AVX512F-NEXT: shrl $3, %esi
+; AVX512F-NEXT: andl $56, %esi
+; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
+; AVX512F-NEXT: subq %rsi, %rdi
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm3
+; AVX512F-NEXT: vpsllq %xmm0, %zmm3, %zmm4
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [63,63]
+; AVX512F-NEXT: vpandn %xmm5, %xmm0, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm3, %zmm0
; AVX512F-NEXT: vporq %zmm0, %zmm4, %zmm0
-; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512F-NEXT: movq %rcx, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: movl %edx, %esi
+; AVX512F-NEXT: movl %ecx, %esi
; AVX512F-NEXT: andl $63, %esi
-; AVX512F-NEXT: vmovq %rsi, %xmm4
-; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
-; AVX512F-NEXT: vpandn %xmm3, %xmm4, %xmm3
-; AVX512F-NEXT: shrl $3, %edx
-; AVX512F-NEXT: andl $-8, %edx
-; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: subq %rdx, %rsi
-; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm5[0,1,2,3,4,5,6]
-; AVX512F-NEXT: vpsrlq $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq %xmm3, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllq %xmm4, %zmm5, %zmm3
-; AVX512F-NEXT: xorl %edx, %edx
-; AVX512F-NEXT: negq %rcx
-; AVX512F-NEXT: sbbl %edx, %edx
-; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT: kmovw %edx, %k1
-; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
+; AVX512F-NEXT: vmovq %rsi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpandn %xmm5, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $3, %ecx
+; AVX512F-NEXT: andl $56, %ecx
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm4, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rdx
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rax)
; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
@@ -2218,52 +2222,52 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512VL-NEXT: vmovdqu64 (%rdx), %zmm0
; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm4, (%rsp)
; AVX512VL-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: andl $63, %edx
-; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
-; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $56, %edx
-; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm4
+; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT: movl $512, %eax # imm = 0x200
+; AVX512VL-NEXT: subq %rdx, %rax
+; AVX512VL-NEXT: movl %eax, %esi
+; AVX512VL-NEXT: andl $63, %esi
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
+; AVX512VL-NEXT: shrl $3, %eax
+; AVX512VL-NEXT: andl $56, %eax
+; AVX512VL-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: subq %rax, %rsi
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm2
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm2, %zmm3
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm4 = [63,63]
+; AVX512VL-NEXT: vpandn %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: valignq {{.*#+}} zmm6 = zmm4[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT: vpaddq %zmm6, %zmm6, %zmm6
-; AVX512VL-NEXT: vpsllq %xmm3, %zmm6, %zmm3
-; AVX512VL-NEXT: vpsrlq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: valignq {{.*#+}} zmm2 = zmm5[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm2, %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm2, %zmm1
; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512VL-NEXT: movq %rcx, %rdx
-; AVX512VL-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512VL-NEXT: movl %edx, %esi
+; AVX512VL-NEXT: movl %ecx, %esi
; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm3
-; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
-; AVX512VL-NEXT: shrl $3, %edx
-; AVX512VL-NEXT: andl $-8, %edx
-; AVX512VL-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512VL-NEXT: subq %rdx, %rsi
-; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm4
-; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[7],zmm4[0,1,2,3,4,5,6]
-; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
-; AVX512VL-NEXT: vpsrlq %xmm2, %zmm5, %zmm2
-; AVX512VL-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
+; AVX512VL-NEXT: vpandn %xmm4, %xmm2, %xmm3
+; AVX512VL-NEXT: shrl $3, %ecx
+; AVX512VL-NEXT: andl $56, %ecx
+; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm5, %zmm3
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
; AVX512VL-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512VL-NEXT: xorl %edx, %edx
-; AVX512VL-NEXT: negq %rcx
-; AVX512VL-NEXT: sbbl %edx, %edx
-; AVX512VL-NEXT: kmovd %edx, %k1
-; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rdx
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VL-NEXT: addq $136, %rsp
; AVX512VL-NEXT: vzeroupper
@@ -2274,6 +2278,7 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512VBMI-NEXT: subq $136, %rsp
; AVX512VBMI-NEXT: vmovups (%rsi), %ymm1
; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm2
+; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovdqu64 (%rdx), %zmm0
; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
@@ -2283,19 +2288,18 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movl %ecx, %edx
; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512VBMI-NEXT: movq %rcx, %rsi
-; AVX512VBMI-NEXT: xorq $511, %rsi # imm = 0x1FF
+; AVX512VBMI-NEXT: movl $512, %esi # imm = 0x200
+; AVX512VBMI-NEXT: subq %rcx, %rsi
; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm2
; AVX512VBMI-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $-8, %esi
+; AVX512VBMI-NEXT: andl $56, %esi
; AVX512VBMI-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: subq %rsi, %rdi
; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm3
@@ -3414,16 +3418,17 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i512_vector:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512F-NEXT: vmovq %rsi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm4
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: shrl $6, %eax
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: shlxl %eax, %ecx, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
; AVX512F-NEXT: vpsllq %xmm4, %zmm5, %zmm4
; AVX512F-NEXT: vpandn %xmm2, %xmm3, %xmm3
@@ -3431,15 +3436,14 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512F-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
; AVX512F-NEXT: vpsrlq $1, %zmm5, %zmm5
; AVX512F-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vporq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: movq %rsi, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rsi, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm4
; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
; AVX512F-NEXT: vpand %xmm2, %xmm4, %xmm5
-; AVX512F-NEXT: shrl $6, %edx
-; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
; AVX512F-NEXT: vpsrlq %xmm5, %zmm1, %zmm5
@@ -3458,16 +3462,15 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512VL-LABEL: fshl_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: shrl $6, %ecx
-; AVX512VL-NEXT: movl $-1, %edx
-; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: movl %esi, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
; AVX512VL-NEXT: vpsllq %xmm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -3475,13 +3478,14 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
; AVX512VL-NEXT: vpsrlq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm4
+; AVX512VL-NEXT: movl $512, %edx # imm = 0x200
+; AVX512VL-NEXT: subq %rsi, %rdx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm4
; AVX512VL-NEXT: vpand %xmm3, %xmm4, %xmm5
-; AVX512VL-NEXT: shrl $6, %ecx
-; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
; AVX512VL-NEXT: kmovd %ecx, %k1
; AVX512VL-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
; AVX512VL-NEXT: vpsrlq %xmm5, %zmm1, %zmm5
@@ -3503,26 +3507,26 @@ define i512 @fshl_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
-; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rsi, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm2
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: movl $-1, %edx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm2[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm5
-; AVX512VBMI-NEXT: vpshldvq %zmm5, %zmm4, %zmm2
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
-; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm2, %zmm4, %zmm1
+; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm3[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm2
; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: negq %rsi
; AVX512VBMI-NEXT: sbbl %ecx, %ecx
@@ -3808,16 +3812,17 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_i512_vector:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512F-NEXT: vmovq %rsi, %xmm2
; AVX512F-NEXT: vpbroadcastq %xmm2, %xmm3
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
; AVX512F-NEXT: vpand %xmm2, %xmm3, %xmm4
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: shrl $6, %eax
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: shlxl %eax, %ecx, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpcompressq %zmm1, %zmm5 {%k1} {z}
; AVX512F-NEXT: vpsrlq %xmm4, %zmm5, %zmm4
; AVX512F-NEXT: vpandn %xmm2, %xmm3, %xmm3
@@ -3825,15 +3830,14 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512F-NEXT: valignq {{.*#+}} zmm5 = zmm5[1,2,3,4,5,6,7],zmm6[0]
; AVX512F-NEXT: vpaddq %zmm5, %zmm5, %zmm5
; AVX512F-NEXT: vpsllq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vporq %zmm4, %zmm3, %zmm3
-; AVX512F-NEXT: movq %rsi, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rsi, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm4
; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
; AVX512F-NEXT: vpand %xmm2, %xmm4, %xmm5
-; AVX512F-NEXT: shrl $6, %edx
-; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512F-NEXT: vpsllq %xmm5, %zmm0, %zmm5
@@ -3852,16 +3856,15 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
;
; AVX512VL-LABEL: fshr_i512_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
-; AVX512VL-NEXT: movl %esi, %ecx
-; AVX512VL-NEXT: shrl $6, %ecx
-; AVX512VL-NEXT: movl $-1, %edx
-; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: movl %esi, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
; AVX512VL-NEXT: vpcompressq %zmm1, %zmm5 {%k1} {z}
; AVX512VL-NEXT: vpsrlq %xmm4, %zmm5, %zmm4
; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
@@ -3869,13 +3872,14 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[1,2,3,4,5,6,7],zmm6[0]
; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
; AVX512VL-NEXT: vpsllq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vporq %zmm4, %zmm2, %zmm2
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm4
+; AVX512VL-NEXT: movl $512, %edx # imm = 0x200
+; AVX512VL-NEXT: subq %rsi, %rdx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm4
; AVX512VL-NEXT: vpand %xmm3, %xmm4, %xmm5
-; AVX512VL-NEXT: shrl $6, %ecx
-; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: shlxl %edx, %ecx, %ecx
; AVX512VL-NEXT: kmovd %ecx, %k1
; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512VL-NEXT: vpsllq %xmm5, %zmm0, %zmm5
@@ -3897,26 +3901,26 @@ define i512 @fshr_i512_vector(<8 x i64> %v0, <8 x i64> %v1, i512 %a2) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
-; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rsi, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm2
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: movl $-1, %edx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm3[0]
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm5
-; AVX512VBMI-NEXT: vpshrdvq %zmm5, %zmm4, %zmm2
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
-; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm2, %zmm4, %zmm0
+; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm3[7],zmm0[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm0
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
+; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm2
; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: negq %rsi
; AVX512VBMI-NEXT: sbbl %ecx, %ecx
@@ -4187,16 +4191,17 @@ define i512 @fshl_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_rot_i512_vector:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512F-NEXT: vmovq %rsi, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: shrl $6, %eax
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: shlxl %eax, %ecx, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
; AVX512F-NEXT: vpsllq %xmm3, %zmm4, %zmm3
; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm2
@@ -4204,15 +4209,14 @@ define i512 @fshl_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
; AVX512F-NEXT: vpsrlq $1, %zmm4, %zmm4
; AVX512F-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT: movq %rsi, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: vmovq %rdx, %xmm3
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rsi, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm3
; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm4
-; AVX512F-NEXT: shrl $6, %edx
-; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpcompressq %zmm0, %zmm6 {%k1} {z}
; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
@@ -4249,8 +4253,8 @@ define i512 @fshl_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512VL-NEXT: vpsrlq $1, %zmm4, %zmm4
; AVX512VL-NEXT: vpsrlq %xmm1, %zmm4, %zmm1
; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: subq %rsi, %rcx
; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
; AVX512VL-NEXT: shrl $6, %ecx
@@ -4277,31 +4281,31 @@ define i512 @fshl_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
-; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rsi, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: movl $-1, %edx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm1 {%k1} {z}
-; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm1[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
-; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm1
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
-; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm4[1,2,3,4,5,6,7],zmm2[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm2, %zmm4
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm1 = zmm3[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm3
+; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm1, %zmm5
; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: negq %rsi
; AVX512VBMI-NEXT: sbbl %ecx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vporq %zmm4, %zmm1, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm5, %zmm0 {%k1}
; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
@@ -4549,16 +4553,17 @@ define i512 @fshr_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_rot_i512_vector:
; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: andl $511, %esi # imm = 0x1FF
; AVX512F-NEXT: vmovq %rsi, %xmm1
; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm2
; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
; AVX512F-NEXT: vpand %xmm1, %xmm2, %xmm3
-; AVX512F-NEXT: movl %esi, %eax
-; AVX512F-NEXT: shrl $6, %eax
-; AVX512F-NEXT: movl $-1, %ecx
-; AVX512F-NEXT: shlxl %eax, %ecx, %eax
-; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: movl %esi, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
; AVX512F-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
; AVX512F-NEXT: vpandn %xmm1, %xmm2, %xmm2
@@ -4566,15 +4571,14 @@ define i512 @fshr_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
; AVX512F-NEXT: vpaddq %zmm4, %zmm4, %zmm4
; AVX512F-NEXT: vpsllq %xmm2, %zmm4, %zmm2
-; AVX512F-NEXT: movq %rdi, %rax
; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: movq %rsi, %rdx
-; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
-; AVX512F-NEXT: vmovq %rdx, %xmm3
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rsi, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm3
; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
; AVX512F-NEXT: vpand %xmm1, %xmm3, %xmm4
-; AVX512F-NEXT: shrl $6, %edx
-; AVX512F-NEXT: shlxl %edx, %ecx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
; AVX512F-NEXT: kmovw %ecx, %k1
; AVX512F-NEXT: vpexpandq %zmm0, %zmm6 {%k1} {z}
; AVX512F-NEXT: vpsllq %xmm4, %zmm6, %zmm4
@@ -4611,8 +4615,8 @@ define i512 @fshr_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512VL-NEXT: vpaddq %zmm4, %zmm4, %zmm4
; AVX512VL-NEXT: vpsllq %xmm1, %zmm4, %zmm1
; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: xorq $511, %rcx # imm = 0x1FF
+; AVX512VL-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: subq %rsi, %rcx
; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
; AVX512VL-NEXT: shrl $6, %ecx
@@ -4639,31 +4643,31 @@ define i512 @fshr_rot_i512_vector(<8 x i64> %v0, i512 %a2) nounwind {
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: movq %rdi, %rax
; AVX512VBMI-NEXT: andl $511, %esi # imm = 0x1FF
-; AVX512VBMI-NEXT: movl %esi, %ecx
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rsi, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: movl $-1, %edx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm1 {%k1} {z}
-; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm1[1,2,3,4,5,6,7],zmm2[0]
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm4
-; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm1
-; AVX512VBMI-NEXT: movq %rsi, %rcx
-; AVX512VBMI-NEXT: xorq $511, %rcx # imm = 0x1FF
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
-; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: movl %esi, %ecx
; AVX512VBMI-NEXT: shrl $6, %ecx
; AVX512VBMI-NEXT: shlxl %ecx, %edx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm4[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm2, %zmm4
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm1 = zmm5[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm3
+; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm1, %zmm5
; AVX512VBMI-NEXT: xorl %ecx, %ecx
; AVX512VBMI-NEXT: negq %rsi
; AVX512VBMI-NEXT: sbbl %ecx, %ecx
; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vporq %zmm1, %zmm4, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vporq %zmm5, %zmm2, %zmm0 {%k1}
; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
>From b373ff6c939303010da86fcb78ad73e7a57d98e9 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 10 Mar 2026 13:17:13 +0000
Subject: [PATCH 3/4] Pull out repeated opcode check
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 11 +++++------
1 file changed, 5 insertions(+), 6 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f11c3c97be0fa..1d40e1b77e7d9 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34537,6 +34537,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
!mayFoldIntoVector(Op1, DAG, Subtarget))
return;
+ bool IsFSHL = Opc == ISD::FSHL;
unsigned BW = VT.getSizeInBits();
MVT AmtVT = MVT::i64;
MVT VecVT = MVT::getVectorVT(MVT::i64, BW / 64);
@@ -34550,7 +34551,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Res = concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
DAG.getBitcast(MVT::v4i64, Op0), DAG, dl);
Res = DAG.getBitcast(MVT::i512, Res);
- if (Opc == ISD::FSHL) {
+ if (IsFSHL) {
Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
DAG.getShiftAmountConstant(256, MVT::i512, dl));
@@ -34568,14 +34569,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Sel = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, AmtZ);
SDValue InvAmt =
DAG.getNode(ISD::SUB, dl, AmtVT, DAG.getConstant(BW, dl, AmtVT), Amt);
- SDValue ShX =
- DAG.getNode(ISD::SHL, dl, VT, Op0, Opc == ISD::FSHL ? Amt : InvAmt);
- SDValue ShY =
- DAG.getNode(ISD::SRL, dl, VT, Op1, Opc == ISD::FSHR ? Amt : InvAmt);
+ SDValue ShX = DAG.getNode(ISD::SHL, dl, VT, Op0, IsFSHL ? Amt : InvAmt);
+ SDValue ShY = DAG.getNode(ISD::SRL, dl, VT, Op1, IsFSHL ? InvAmt : Amt);
SDValue Res = DAG.getNode(ISD::OR, dl, VecVT, DAG.getBitcast(VecVT, ShX),
DAG.getBitcast(VecVT, ShY));
Res = DAG.getSelect(dl, VecVT, DAG.getBitcast(BoolVT, Sel), Res,
- DAG.getBitcast(VecVT, Opc == ISD::FSHL ? Op0 : Op1));
+ DAG.getBitcast(VecVT, IsFSHL ? Op0 : Op1));
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
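To make the generic expansion in the hunk above concrete: it computes the funnel shift as a pair of full-width shifts plus an OR, with a select guarding the amount-zero case (where InvAmt would equal BW and the complementary shift would be out of range). Below is a minimal scalar sketch of the same semantics, using a hypothetical 64-bit analogue since i512 is not a native C++ type; the names mirror the DAG values, and 64 stands in for BW = VT.getSizeInBits().

#include <cstdint>

// Illustrative only: fshl64_expanded(X, Y, Z) returns the top 64 bits of
// ((X:Y) << (Z & 63)), the scalar meaning of the vectorised expansion.
static uint64_t fshl64_expanded(uint64_t X, uint64_t Y, uint64_t Z) {
  uint64_t Amt = Z & 63;      // Amt    = Z & (BW - 1)
  uint64_t InvAmt = 64 - Amt; // InvAmt = BW - Amt (equals BW when Amt == 0)
  uint64_t ShX = X << Amt;    // ShX = SHL(Op0, IsFSHL ? Amt : InvAmt)
  // Guard InvAmt == 64: a 64-bit shift by 64 is undefined in C++; the DAG
  // code instead selects Op0 outright when the masked amount is zero.
  uint64_t ShY = InvAmt < 64 ? Y >> InvAmt : 0; // ShY = SRL(Op1, InvAmt)
  return Amt == 0 ? X : (ShX | ShY); // the BoolVT/Sel zero-amount select
}

The IsFSHL flag pulled out in this patch just decides which operand takes Amt and which takes InvAmt; fshr is the mirror image.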
>From 10782fc6bcf3d210e48275d8d303f63feb06f7c5 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Tue, 10 Mar 2026 13:40:33 +0000
Subject: [PATCH 4/4] Don't use getShiftAmountConstant as 256 won't fit into an
 i8 legalised shift amount type - consistently use the AmtVT
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1d40e1b77e7d9..1734710977650 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34554,7 +34554,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (IsFSHL) {
Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
- DAG.getShiftAmountConstant(256, MVT::i512, dl));
+ DAG.getConstant(256, dl, AmtVT));
} else {
Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
}
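As a hypothetical standalone illustration of why this last one-liner matters: per the commit message, the legalised shift-amount type on x86 is only 8 bits wide, so 256 cannot be represented in it and would wrap under modular conversion, as below. Building the constant in AmtVT (i64) sidesteps the truncation.

#include <cstdint>
#include <cstdio>

int main() {
  // 256 needs 9 bits: converting it to an 8-bit type wraps to 0, which
  // would turn the intended ">> 256" of the concatenated value into a
  // no-op shift.
  uint8_t NarrowAmt = static_cast<uint8_t>(256); // well-defined wrap to 0
  std::printf("256 as i8 = %u\n", NarrowAmt);    // prints 0
}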