[llvm] [X86] Expand i512 shifts on AVX512 targets (PR #183198)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 5 03:55:43 PST 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
Cast to vXi64 and use EXPAND/COMPRESS to left/right shift the i64 elements into place, then use FSHL/FSHR to perform the final bit shift.
Fixes: #178215
---
Patch is 367.82 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/183198.diff
3 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+110)
- (modified) llvm/test/CodeGen/X86/shift-i512.ll (+538-1357)
- (modified) llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll (+1574-2825)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 6bb558f4ef6da..bf1d36bf06a39 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1888,6 +1888,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::XOR, MVT::i512, Custom);
setOperationAction(ISD::ADD, MVT::i512, Custom);
setOperationAction(ISD::SUB, MVT::i512, Custom);
+ setOperationAction(ISD::SRL, MVT::i512, Custom);
+ setOperationAction(ISD::SHL, MVT::i512, Custom);
+ setOperationAction(ISD::SRA, MVT::i512, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
@@ -2936,6 +2939,10 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
// Check for larger than legal scalar integer ops that might have been
// custom lowered to vector instruction.
switch (Opcode) {
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget);
case ISD::AND:
case ISD::OR:
case ISD::XOR:
@@ -34431,6 +34438,92 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA: {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ SDValue Amt = N->getOperand(1);
+ assert(Subtarget.useAVX512Regs() && "AVX512F required");
+ assert(VT == MVT::i512 && "Unexpected VT!");
+ MVT VecVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
+ MVT BoolVT = VecVT.changeVectorElementType(MVT::i1);
+
+ if (!mayFoldIntoVector(Src, DAG, Subtarget))
+ return;
+
+ // Early out if this will fold to a constant shift of whole byte elements.
+ // TODO: Directly lower to a shuffle?
+ if (auto *AmtC = dyn_cast<ConstantSDNode>(Amt)) {
+ assert(AmtC->getAPIntValue().ult(512) && "Out of bounds shift amount");
+ if (AmtC->getAPIntValue().urem(8) == 0)
+ return;
+ }
+
+ SDValue AmtLane = DAG.getNode(ISD::SRL, dl, MVT::i32,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i32),
+ DAG.getShiftAmountConstant(6, MVT::i32, dl));
+ AmtLane = DAG.getZExtOrTrunc(AmtLane, dl, MVT::i8);
+
+ if (auto *SrcC = dyn_cast<ConstantSDNode>(Src)) {
+ // Special case: SHL(1,Amt) --> SELECT(1<<(Amt/64), SPLAT(1<<(Amt%64)), 0)
+ if (Opc == ISD::SHL && SrcC->getAPIntValue() == 1) {
+ SDValue Bit = DAG.getConstant(1, dl, MVT::i64);
+ SDValue AmtMod = DAG.getNode(ISD::AND, dl, MVT::i64,
+ DAG.getZExtOrTrunc(Amt, dl, MVT::i64),
+ DAG.getConstant(63, dl, MVT::i64));
+ SDValue LaneMask = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtLane);
+ LaneMask =
+ DAG.getBitcast(BoolVT, DAG.getZExtOrTrunc(LaneMask, dl, MVT::i8));
+ SDValue Elt = DAG.getNode(ISD::SHL, dl, MVT::i64, Bit, AmtMod);
+ SDValue Res =
+ DAG.getSelect(dl, VecVT, LaneMask, DAG.getSplat(VecVT, dl, Elt),
+ DAG.getConstant(0, dl, VecVT));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
+ }
+
+ // Use EXPAND/COMPRESS to shuffle the i64 elements left/right with the
+ // ShiftAmt/64 'laneshift', and then shuffle one element along to get the
+ // shifted in bits from the neighbouring element. Finally use a funnel shift
+ // with the ShiftAmt%64 'elementshift' to get the final result.
+ SDValue Mask =
+ DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::SHL, dl, MVT::i32,
+ DAG.getAllOnesConstant(dl, MVT::i32), AmtLane));
+ Src = DAG.getBitcast(VecVT, Src);
+
+ SDValue PassThrough;
+ if (Opc == ISD::SRA) {
+ // Splat the MSB sign bit across the vector.
+ PassThrough = DAG.getNode(ISD::SRA, dl, VecVT, Src,
+ DAG.getShiftAmountConstant(63, VecVT, dl));
+ PassThrough = DAG.getVectorShuffle(VecVT, dl, PassThrough, PassThrough,
+ {7, 7, 7, 7, 7, 7, 7, 7});
+ } else {
+ PassThrough = DAG.getConstant(0, dl, VecVT);
+ }
+ SDValue A, B;
+ if (Opc == ISD::SHL) {
+ A = DAG.getNode(X86ISD::EXPAND, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ B = DAG.getVectorShuffle(VecVT, dl, PassThrough, A,
+ {7, 8, 9, 10, 11, 12, 13, 14});
+ } else {
+ B = DAG.getNode(X86ISD::COMPRESS, dl, VecVT, Src, PassThrough,
+ DAG.getBitcast(BoolVT, Mask));
+ A = DAG.getVectorShuffle(VecVT, dl, B, PassThrough,
+ {1, 2, 3, 4, 5, 6, 7, 8});
+ }
+ // Funnel shifts use modulo shift amount so no need to explicitly mask it.
+ SDValue Res =
+ DAG.getNode(Opc == ISD::SHL ? ISD::FSHL : ISD::FSHR, dl, VecVT, A, B,
+ DAG.getSplatBuildVector(
+ VecVT, dl, DAG.getZExtOrTrunc(Amt, dl, MVT::i64)));
+ Results.push_back(DAG.getBitcast(VT, Res));
+ return;
+ }
case ISD::CTPOP: {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -48076,6 +48169,23 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
dl, DAG, DCI))
return V;
+ // Scalarize single use funnel shift.
+ // Ideally DAG would handle this similar to scalarizeExtractedBinOp.
+ if (InputVector.getOpcode() == ISD::FSHL ||
+ InputVector.getOpcode() == ISD::FSHR) {
+ if (CIdx && InputVector.hasOneUse() &&
+ TLI.isOperationLegal(InputVector.getOpcode(), VT)) {
+ SDValue LHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(0),
+ CIdx->getZExtValue());
+ SDValue RHS = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(1),
+ CIdx->getZExtValue());
+ SDValue Amt = DAG.getExtractVectorElt(dl, VT, InputVector.getOperand(2),
+ CIdx->getZExtValue());
+ Amt = DAG.getShiftAmountOperand(VT, Amt);
+ return DAG.getNode(InputVector.getOpcode(), dl, VT, LHS, RHS, Amt);
+ }
+ }
+
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
//
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f60585e978104..c8ac18d1d309a 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -133,68 +133,42 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: shl_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: pushq %rax
-; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
; AVX512F-NEXT: negl %eax
-; AVX512F-NEXT: movslq %eax, %r8
-; AVX512F-NEXT: movq -56(%rsp,%r8), %rdx
-; AVX512F-NEXT: movq -48(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -40(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %r9
-; AVX512F-NEXT: shldq %cl, %rax, %r9
-; AVX512F-NEXT: movq -32(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq -24(%rsp,%r8), %r10
-; AVX512F-NEXT: movq %r10, %rbx
-; AVX512F-NEXT: shldq %cl, %rax, %rbx
-; AVX512F-NEXT: movq -16(%rsp,%r8), %rax
-; AVX512F-NEXT: movq %rax, %r14
-; AVX512F-NEXT: shldq %cl, %r10, %r14
-; AVX512F-NEXT: movq -8(%rsp,%r8), %r10
-; AVX512F-NEXT: shldq %cl, %rax, %r10
+; AVX512F-NEXT: cltq
+; AVX512F-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsllq %xmm1, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7],zmm2[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq -64(%rsp,%r8), %rdi
-; AVX512F-NEXT: shlxq %rcx, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %r10, 56(%rax)
-; AVX512F-NEXT: movq %r14, 48(%rax)
-; AVX512F-NEXT: movq %rbx, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r9, 24(%rax)
-; AVX512F-NEXT: movq %rsi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r8, (%rax)
-; AVX512F-NEXT: addq $8, %rsp
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: shl_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
@@ -210,53 +184,31 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
; AVX512VL-NEXT: negl %eax
-; AVX512VL-NEXT: movslq %eax, %r9
-; AVX512VL-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VL-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VL-NEXT: movq %rax, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
-; AVX512VL-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq %r11, %rbx
-; AVX512VL-NEXT: shldq %cl, %r10, %rbx
+; AVX512VL-NEXT: cltq
+; AVX512VL-NEXT: vmovdqu64 -64(%rsp,%rax), %zmm1
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpandnq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsrlq %xmm0, %zmm1, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VL-NEXT: movq %rdi, %r10
-; AVX512VL-NEXT: shldq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VL-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %rdi, %r15
-; AVX512VL-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VL-NEXT: shldq %cl, %r14, %rdi
-; AVX512VL-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: movq %r15, 48(%rax)
-; AVX512VL-NEXT: movq %r10, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r8, 24(%rax)
-; AVX512VL-NEXT: movq %rsi, 16(%rax)
-; AVX512VL-NEXT: movq %rdx, 8(%rax)
-; AVX512VL-NEXT: movq %r9, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vporq %zmm0, %zmm2, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: popq %rcx
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: shl_i512:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512VBMI-NEXT: pushq %rax
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; AVX512VBMI-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VBMI-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX512VBMI-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
@@ -265,50 +217,23 @@ define i512 @shl_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VBMI-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
; AVX512VBMI-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %eax, %ecx
-; AVX512VBMI-NEXT: andl $63, %ecx
-; AVX512VBMI-NEXT: shrl $3, %eax
-; AVX512VBMI-NEXT: andl $56, %eax
-; AVX512VBMI-NEXT: negl %eax
-; AVX512VBMI-NEXT: movslq %eax, %r9
-; AVX512VBMI-NEXT: movq -56(%rsp,%r9), %rdx
-; AVX512VBMI-NEXT: movq -48(%rsp,%r9), %rax
-; AVX512VBMI-NEXT: movq %rax, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq -40(%rsp,%r9), %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
-; AVX512VBMI-NEXT: movq -32(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq %r11, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq -24(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: movq %rdi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r11, %r10
-; AVX512VBMI-NEXT: movq -64(%rsp,%r9), %r11
-; AVX512VBMI-NEXT: movq -16(%rsp,%r9), %r14
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %rdi, %r15
-; AVX512VBMI-NEXT: movq -8(%rsp,%r9), %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r14, %rdi
-; AVX512VBMI-NEXT: shlxq %rcx, %r11, %r9
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: movq %r15, 48(%rax)
-; AVX512VBMI-NEXT: movq %r10, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r8, 24(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 16(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rax)
-; AVX512VBMI-NEXT: movq %r9, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: movl %edi, %ecx
+; AVX512VBMI-NEXT: shrl $3, %ecx
+; AVX512VBMI-NEXT: andl $56, %ecx
+; AVX512VBMI-NEXT: negl %ecx
+; AVX512VBMI-NEXT: movslq %ecx, %rcx
+; AVX512VBMI-NEXT: vmovdqu64 -64(%rsp,%rcx), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm0, %zmm2, %zmm1
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm1, 32(%rax)
+; AVX512VBMI-NEXT: vmovdqu %ymm1, (%rax)
+; AVX512VBMI-NEXT: popq %rcx
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%r = shl i512 %a0, %a1
@@ -428,16 +353,14 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
;
; AVX512F-LABEL: lshr_i512:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: pushq %rax
; AVX512F-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movq %r10, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX512F-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: vmovups %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512F-NEXT: movq %r9, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %r8, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
@@ -445,44 +368,25 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512F-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movl %eax, %ecx
; AVX512F-NEXT: andl $63, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
; AVX512F-NEXT: shrl $3, %eax
; AVX512F-NEXT: andl $56, %eax
-; AVX512F-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512F-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512F-NEXT: shrdq %cl, %r8, %rdx
-; AVX512F-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512F-NEXT: shrdq %cl, %r11, %r10
-; AVX512F-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: movq -128(%rsp,%rax), %r14
-; AVX512F-NEXT: movq -72(%rsp,%rax), %r15
-; AVX512F-NEXT: shrdq %cl, %r15, %rbx
-; AVX512F-NEXT: shrdq %cl, %r9, %r14
+; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rax), %zmm2
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm2, %zmm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm2[1,2,3,4,5,6,7],zmm1[0]
+; AVX512F-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT: vpandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm1, %zmm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: shrxq %rcx, %r15, %rcx
-; AVX512F-NEXT: movq %rcx, 56(%rdi)
-; AVX512F-NEXT: movq %rbx, 48(%rdi)
-; AVX512F-NEXT: movq %r11, 40(%rdi)
-; AVX512F-NEXT: movq %r10, 32(%rdi)
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %rdx, 16(%rdi)
-; AVX512F-NEXT: movq %rsi, 8(%rdi)
-; AVX512F-NEXT: movq %r14, (%rdi)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vporq %zmm3, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: popq %rcx
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: lshr_i512:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
+; AVX512VL-NEXT: pushq %rax
; AVX512VL-NEXT: vmovaps {{[0-9]+}}(%rsp), %xmm0
; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
@@ -498,91 +402,52 @@ define i512 @lshr_i512(i512 %a0, i512 %a1) nounwind {
; AVX512VL-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; AVX512VL-NEXT: movl %eax, %ecx
; AVX512VL-NEXT: andl $63, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
; AVX512VL-NEXT: shrl $3, %eax
; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: movq -112(%rsp,%rax), %rdx
-; AVX512VL-NEXT: movq -120(%rsp,%rax), %r9
-; AVX512VL-NEXT: movq %r9, %rsi
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq -104(%rsp,%rax), %r8
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: movq -96(%rsp,%rax), %r10
-; AVX512VL-NEXT: shrdq %cl, %r10, %r8
-; AVX512VL-NEXT: movq -88(%rsp,%rax), %r11
-; AVX512VL-NEXT: shrdq %cl, %r11, %r10
-; AVX512VL-NEXT: movq -80(%rsp,%rax), %rbx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: movq -72(%rsp,%rax), %r14
-...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/183198
More information about the llvm-commits
mailing list