[llvm] [X86] Enable i512 fshl/fshr lowering on avx512 targets (PR #185615)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 10 03:58:09 PDT 2026
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
If the operands are freely foldable to vector, expand to a shift pair, along with zero-amount handling.
Unfortunately, using the generic TargetLowering::expandFunnelShift results in additional shifts/scalarizations instead of staying on the FPU.
Fixes #184828
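
For reference, the double-width trick the existing i256 path uses (see the `fshl(x,y,z)` comments in the diff below) can be modelled on 64-bit scalars with a 128-bit container. A minimal illustrative C++ sketch, not the SelectionDAG code itself, with made-up helper names:

```cpp
#include <cstdint>

// Scalar model of the concat-and-shift expansion, with bw = 64 and
// unsigned __int128 (GCC/Clang extension) as the double-width container.

// fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
static uint64_t fshl64(uint64_t X, uint64_t Y, uint64_t Z) {
  unsigned __int128 Concat = ((unsigned __int128)X << 64) | Y;
  return (uint64_t)((Concat << (Z & 63)) >> 64); // high half after the shift
}

// fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
static uint64_t fshr64(uint64_t X, uint64_t Y, uint64_t Z) {
  unsigned __int128 Concat = ((unsigned __int128)X << 64) | Y;
  return (uint64_t)(Concat >> (Z & 63)); // low half after the shift
}
```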
---
Patch is 96.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/185615.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+40-13)
- (modified) llvm/test/CodeGen/X86/funnel-shift-i512.ll (+702-1470)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 328e79eb07ca5..783a967d0eb5a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1891,6 +1891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL, MVT::i512, Custom);
setOperationAction(ISD::SHL, MVT::i512, Custom);
setOperationAction(ISD::SRA, MVT::i512, Custom);
+ setOperationAction(ISD::FSHR, MVT::i512, Custom);
+ setOperationAction(ISD::FSHL, MVT::i512, Custom);
setOperationAction(ISD::FSHR, MVT::i256, Custom);
setOperationAction(ISD::FSHL, MVT::i256, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
@@ -34530,26 +34532,51 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue Op1 = N->getOperand(1);
SDValue Amt = N->getOperand(2);
assert(Subtarget.useAVX512Regs() && "AVX512F required");
- assert(VT == MVT::i256 && "Unexpected VT!");
+ assert((VT == MVT::i256 || VT == MVT::i512) && "Unexpected VT!");
if (!mayFoldIntoVector(Op0, DAG, Subtarget) ||
!mayFoldIntoVector(Op1, DAG, Subtarget))
return;
+ unsigned BW = VT.getSizeInBits();
+ MVT AmtVT = MVT::i64;
+ MVT VecVT = MVT::getVectorVT(MVT::i64, BW / 64);
+ MVT BoolVT = MVT::getVectorVT(MVT::i1, BW / 64);
+ Amt = DAG.getNode(ISD::AND, dl, AmtVT, DAG.getZExtOrTrunc(Amt, dl, AmtVT),
+ DAG.getConstant(BW - 1, dl, AmtVT));
+
// fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
// fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
- SDValue Res = DAG.getBitcast(
- MVT::i512, concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
- DAG.getBitcast(MVT::v4i64, Op0), DAG, dl));
- Amt = DAG.getNode(ISD::AND, dl, Amt.getValueType(), Amt,
- DAG.getConstant(255, dl, Amt.getValueType()));
- if (Opc == ISD::FSHL) {
- Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
- Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
- DAG.getShiftAmountConstant(256, MVT::i512, dl));
- } else {
- Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
+ if (VT == MVT::i256) {
+ SDValue Res = concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
+ DAG.getBitcast(MVT::v4i64, Op0), DAG, dl);
+ Res = DAG.getBitcast(MVT::i512, Res);
+ if (Opc == ISD::FSHL) {
+ Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
+ DAG.getShiftAmountConstant(256, MVT::i512, dl));
+ } else {
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
+ }
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res));
+ return;
}
- Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res));
+
+ // fshl: z == 0 ? x : (x << (z % bw) | y >> (bw - (z % bw)))
+ // fshr: z == 0 ? y : (x << (bw - (z % bw)) | y >> (z % bw))
+ SDValue AmtZ = DAG.getSetCC(dl, MVT::i1, Amt, DAG.getConstant(0, dl, AmtVT),
+ ISD::SETNE);
+ SDValue Sel = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i8, AmtZ);
+ SDValue InvAmt = DAG.getNode(ISD::SUB, dl, AmtVT,
+ DAG.getConstant(BW - 1, dl, AmtVT), Amt);
+ SDValue ShX =
+ DAG.getNode(ISD::SHL, dl, VT, Op0, Opc == ISD::FSHL ? Amt : InvAmt);
+ SDValue ShY =
+ DAG.getNode(ISD::SRL, dl, VT, Op1, Opc == ISD::FSHR ? Amt : InvAmt);
+ SDValue Res = DAG.getNode(ISD::OR, dl, VecVT, DAG.getBitcast(VecVT, ShX),
+ DAG.getBitcast(VecVT, ShY));
+ Res = DAG.getSelect(dl, VecVT, DAG.getBitcast(BoolVT, Sel), Res,
+ DAG.getBitcast(VecVT, Opc == ISD::FSHL ? Op0 : Op1));
+ Results.push_back(DAG.getBitcast(VT, Res));
return;
}
case ISD::CTPOP: {
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i512.ll b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
index af48bff3cdff7..35c67a696dfef 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i512.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
@@ -1837,263 +1837,158 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq (%rsi), %r11
-; AVX512F-NEXT: movq 8(%rsi), %r10
-; AVX512F-NEXT: movq 16(%rsi), %r9
-; AVX512F-NEXT: movq 24(%rsi), %r8
-; AVX512F-NEXT: movq 32(%rdx), %r14
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq 56(%rsi), %rax
-; AVX512F-NEXT: cmovneq %r8, %rax
-; AVX512F-NEXT: movq 40(%rdx), %r13
-; AVX512F-NEXT: movq 48(%rsi), %rbp
-; AVX512F-NEXT: cmovneq %r9, %rbp
-; AVX512F-NEXT: movq 48(%rdx), %r15
-; AVX512F-NEXT: movq 40(%rsi), %rbx
-; AVX512F-NEXT: cmovneq %r10, %rbx
-; AVX512F-NEXT: movq 56(%rdx), %r12
-; AVX512F-NEXT: movq 32(%rsi), %rsi
-; AVX512F-NEXT: cmovneq %r11, %rsi
-; AVX512F-NEXT: cmovneq %r12, %r8
-; AVX512F-NEXT: cmovneq %r15, %r9
-; AVX512F-NEXT: cmovneq %r13, %r10
-; AVX512F-NEXT: cmovneq %r14, %r11
-; AVX512F-NEXT: cmovneq 16(%rdx), %r15
-; AVX512F-NEXT: cmovneq (%rdx), %r14
-; AVX512F-NEXT: cmovneq 8(%rdx), %r13
-; AVX512F-NEXT: cmovneq 24(%rdx), %r12
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmoveq %r12, %r13
-; AVX512F-NEXT: cmoveq %r15, %r14
-; AVX512F-NEXT: cmoveq %r11, %r15
-; AVX512F-NEXT: cmoveq %r10, %r12
-; AVX512F-NEXT: cmoveq %r9, %r11
-; AVX512F-NEXT: cmoveq %r8, %r10
-; AVX512F-NEXT: cmoveq %rsi, %r9
-; AVX512F-NEXT: cmoveq %rbx, %r8
-; AVX512F-NEXT: cmoveq %rbp, %rsi
-; AVX512F-NEXT: cmoveq %rax, %rbx
+; AVX512F-NEXT: subq $136, %rsp
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vmovups (%rdx), %zmm1
+; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 %zmm2, {{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovups %zmm1, (%rsp)
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: andl $63, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512F-NEXT: vpandn %xmm3, %xmm1, %xmm4
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $3, %eax
+; AVX512F-NEXT: andl $56, %eax
+; AVX512F-NEXT: leaq -{{[0-9]+}}(%rsp), %rdx
+; AVX512F-NEXT: subq %rax, %rdx
+; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm6 = zmm2[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm6, %zmm6
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpsllq %xmm1, %zmm5, %zmm1
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmovneq %rsi, %rbx
-; AVX512F-NEXT: cmovneq %r8, %rsi
-; AVX512F-NEXT: cmovneq %r9, %r8
-; AVX512F-NEXT: cmovneq %r10, %r9
-; AVX512F-NEXT: cmovneq %r11, %r10
-; AVX512F-NEXT: cmovneq %r12, %r11
-; AVX512F-NEXT: cmovneq %r15, %r12
-; AVX512F-NEXT: cmoveq %r13, %r14
-; AVX512F-NEXT: cmovneq %r13, %r15
-; AVX512F-NEXT: movq %r15, %rdx
-; AVX512F-NEXT: shldq %cl, %r14, %rdx
-; AVX512F-NEXT: movq %r12, %rdi
-; AVX512F-NEXT: shldq %cl, %r15, %rdi
-; AVX512F-NEXT: movq %r11, %r14
-; AVX512F-NEXT: shldq %cl, %r12, %r14
-; AVX512F-NEXT: movq %r10, %r15
-; AVX512F-NEXT: shldq %cl, %r11, %r15
-; AVX512F-NEXT: movq %r9, %r11
-; AVX512F-NEXT: shldq %cl, %r10, %r11
-; AVX512F-NEXT: movq %r8, %r10
-; AVX512F-NEXT: shldq %cl, %r9, %r10
-; AVX512F-NEXT: movq %rsi, %r9
-; AVX512F-NEXT: shldq %cl, %r8, %r9
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rsi, %rbx
-; AVX512F-NEXT: movq %rbx, 56(%rax)
-; AVX512F-NEXT: movq %r9, 48(%rax)
-; AVX512F-NEXT: movq %r10, 40(%rax)
-; AVX512F-NEXT: movq %r11, 32(%rax)
-; AVX512F-NEXT: movq %r15, 24(%rax)
-; AVX512F-NEXT: movq %r14, 16(%rax)
-; AVX512F-NEXT: movq %rdi, 8(%rax)
-; AVX512F-NEXT: movq %rdx, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT: movq %rcx, %rdx
+; AVX512F-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512F-NEXT: movl %edx, %esi
+; AVX512F-NEXT: andl $63, %esi
+; AVX512F-NEXT: vmovq %rsi, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm5, %zmm3
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: negq %rcx
+; AVX512F-NEXT: sbbl %edx, %edx
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq (%rsi), %r11
-; AVX512VL-NEXT: movq 8(%rsi), %r9
-; AVX512VL-NEXT: movq 16(%rsi), %rax
-; AVX512VL-NEXT: movq 24(%rsi), %r8
-; AVX512VL-NEXT: movq 32(%rdx), %r15
-; AVX512VL-NEXT: movq 40(%rdx), %r12
-; AVX512VL-NEXT: movq 48(%rdx), %r14
-; AVX512VL-NEXT: movq 56(%rdx), %rbx
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq 56(%rsi), %r13
-; AVX512VL-NEXT: cmovneq %r8, %r13
-; AVX512VL-NEXT: movq 48(%rsi), %rbp
-; AVX512VL-NEXT: cmovneq %rax, %rbp
-; AVX512VL-NEXT: movq 40(%rsi), %r10
-; AVX512VL-NEXT: cmovneq %r9, %r10
-; AVX512VL-NEXT: movq 32(%rsi), %rsi
-; AVX512VL-NEXT: cmovneq %r11, %rsi
-; AVX512VL-NEXT: cmovneq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %r14, %rax
-; AVX512VL-NEXT: cmovneq %r15, %r11
-; AVX512VL-NEXT: cmovneq 16(%rdx), %r14
-; AVX512VL-NEXT: cmovneq (%rdx), %r15
-; AVX512VL-NEXT: cmovneq %r12, %r9
-; AVX512VL-NEXT: cmovneq 8(%rdx), %r12
-; AVX512VL-NEXT: cmovneq 24(%rdx), %rbx
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmoveq %rbx, %r12
-; AVX512VL-NEXT: cmoveq %r14, %r15
-; AVX512VL-NEXT: cmoveq %r11, %r14
-; AVX512VL-NEXT: cmoveq %r9, %rbx
-; AVX512VL-NEXT: cmoveq %rax, %r11
-; AVX512VL-NEXT: cmoveq %r8, %r9
-; AVX512VL-NEXT: cmoveq %rsi, %rax
-; AVX512VL-NEXT: cmoveq %r10, %r8
-; AVX512VL-NEXT: cmoveq %rbp, %rsi
-; AVX512VL-NEXT: cmoveq %r13, %r10
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmovneq %rsi, %r10
-; AVX512VL-NEXT: cmovneq %r8, %rsi
-; AVX512VL-NEXT: cmovneq %rax, %r8
-; AVX512VL-NEXT: cmovneq %r9, %rax
-; AVX512VL-NEXT: cmovneq %r11, %r9
-; AVX512VL-NEXT: cmovneq %rbx, %r11
-; AVX512VL-NEXT: cmovneq %r14, %rbx
-; AVX512VL-NEXT: cmoveq %r12, %r15
-; AVX512VL-NEXT: cmovneq %r12, %r14
-; AVX512VL-NEXT: movq %r14, %rdx
-; AVX512VL-NEXT: shldq %cl, %r15, %rdx
-; AVX512VL-NEXT: movq %rbx, %r15
-; AVX512VL-NEXT: shldq %cl, %r14, %r15
-; AVX512VL-NEXT: movq %r11, %r14
-; AVX512VL-NEXT: shldq %cl, %rbx, %r14
-; AVX512VL-NEXT: movq %r9, %rbx
-; AVX512VL-NEXT: shldq %cl, %r11, %rbx
-; AVX512VL-NEXT: movq %rax, %r11
-; AVX512VL-NEXT: shldq %cl, %r9, %r11
-; AVX512VL-NEXT: movq %r8, %r9
-; AVX512VL-NEXT: shldq %cl, %rax, %r9
+; AVX512VL-NEXT: subq $136, %rsp
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VL-NEXT: vmovups (%rdx), %ymm2
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rsi, %rdi
-; AVX512VL-NEXT: shldq %cl, %r8, %rdi
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rsi, %r10
-; AVX512VL-NEXT: movq %r10, 56(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %r9, 40(%rax)
-; AVX512VL-NEXT: movq %r11, 32(%rax)
-; AVX512VL-NEXT: movq %rbx, 24(%rax)
-; AVX512VL-NEXT: movq %r14, 16(%rax)
-; AVX512VL-NEXT: movq %r15, 8(%rax)
-; AVX512VL-NEXT: movq %rdx, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
+; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovups %ymm2, (%rsp)
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: andl $63, %edx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: shrl $3, %edx
+; AVX512VL-NEXT: andl $56, %edx
+; AVX512VL-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: subq %rdx, %rsi
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm4
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm6 = zmm5[7],zmm3[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm6, %zmm6
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: vporq %zmm4, %zmm1, %zmm1
+; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT: movq %rcx, %rdx
+; AVX512VL-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512VL-NEXT: movl %edx, %esi
+; AVX512VL-NEXT: andl $63, %esi
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm5, %zmm2
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: negq %rcx
+; AVX512VL-NEXT: sbbl %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: addq $136, %rsp
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq (%rsi), %r11
-; AVX512VBMI-NEXT: movq 8(%rsi), %r9
-; AVX512VBMI-NEXT: movq 16(%rsi), %rax
-; AVX512VBMI-NEXT: movq 24(%rsi), %r8
-; AVX512VBMI-NEXT: movq 32(%rdx), %r15
-; AVX512VBMI-NEXT: movq 40(%rdx), %r12
-; AVX512VBMI-NEXT: movq 48(%rdx), %r14
-; AVX512VBMI-NEXT: movq 56(%rdx), %rbx
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq 56(%rsi), %r13
-; AVX512VBMI-NEXT: cmovneq %r8, %r13
-; AVX512VBMI-NEXT: movq 48(%rsi), %rbp
-; AVX512VBMI-NEXT: cmovneq %rax, %rbp
-; AVX512VBMI-NEXT: movq 40(%rsi), %r10
-; AVX512VBMI-NEXT: cmovneq %r9, %r10
-; AVX512VBMI-NEXT: movq 32(%rsi), %rsi
-; AVX512VBMI-NEXT: cmovneq %r11, %rsi
-; AVX512VBMI-NEXT: cmovneq %rbx, %r8
-; AVX512VBMI-NEXT: cmovneq %r14, %rax
-; AVX512VBMI-NEXT: cmovneq %r15, %r11
-; AVX512VBMI-NEXT: cmovneq 16(%rdx), %r14
-; AVX512VBMI-NEXT: cmovneq (%rdx), %r15
-; AVX512VBMI-NEXT: cmovneq %r12, %r9
-; AVX512VBMI-NEXT: cmovneq 8(%rdx), %r12
-; AVX512VBMI-NEXT: cmovneq 24(%rdx), %rbx
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: cmoveq %rbx, %r12
-; AVX512VBMI-NEXT: cmoveq %r14, %r15
-; AVX512VBMI-NEXT: cmoveq %r11, %r14
-; AVX512VBMI-NEXT: cmoveq %r9, %rbx
-; AVX512VBMI-NEXT: cmoveq %rax, %r11
-; AVX512VBMI-NEXT: cmoveq %r8, %r9
-; AVX512VBMI-NEXT: cmoveq %rsi, %rax
-; AVX512VBMI-NEXT: cmoveq %r10, %r8
-; AVX512VBMI-NEXT: cmoveq %rbp, %rsi
-; AVX512VBMI-NEXT: cmoveq %r13, %r10
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmovneq %rsi, %r10
-; AVX512VBMI-NEXT: cmovneq %r8, %rsi
-; AVX512VBMI-NEXT: cmovneq %rax, %r8
-; AVX512VBMI-NEXT: cmovneq %r9, %rax
-; AVX512VBMI-NEXT: cmovneq %r11, %r9
-; AVX512VBMI-NEXT: cmovneq %rbx, %r11
-; AVX512VBMI-NEXT: cmovneq %r14, %rbx
-; AVX512VBMI-NEXT: cmoveq %r12, %r15
-; AVX512VBMI-NEXT: cmovneq %r12, %r14
-; AVX512VBMI-NEXT: movq %r14, %rdx
-; AVX512VBMI-NEXT: shldq %cl, %r15, %rdx
-; AVX512VBMI-NEXT: movq %rbx, %r15
-; AVX512VBMI-NEXT: shldq %cl, %r14, %r15
-; AVX512VBMI-NEXT: movq %r11, %r14
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: movq %r9, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rbx
-; AVX512VBMI-NEXT: movq %rax, %r11
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r11
-; AVX512VBMI-NEXT: movq %r8, %r9
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r9
+; AVX512VBMI-NEXT: subq $136, %rsp
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
+; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm2
+; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %rsi, %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rdi
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %r10
-; AVX512VBMI-NEXT: movq %r10, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT: movq %r9, 40(%rax)
-; AVX512VBMI-NEXT: movq %r11, 32(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 24(%rax)
-; AVX512VBMI-NEXT: movq %r14, 16(%rax)
-; AVX512VBMI-NEXT: movq %r15, 8(%rax)
-; AVX512VBMI-NEXT: movq %rdx, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovdqu %ymm2, (%rsp)
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $3, %edx
+; AVX512VBMI-NEXT: andl $56, %edx
+; AVX512VBMI-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VBMI-NEXT: subq %rdx, %rsi
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm1
+; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VBMI-NEXT: movq %rcx, %rdx
+; AVX512VBMI-NEXT: xorq $511, %rdx # imm = 0x1FF
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT: # kill: def $edx killed $edx killed $rdx def $rdx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: vmovdqu64 (%rsp,%rdx,8), %zmm4
+; AVX512VBMI-NEXT: valignq {...
[truncated]
``````````
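
The new i512 path instead keeps the value in a single 512-bit shift pair and guards the zero shift amount with a select, per the `fshl`/`fshr` comments in the first hunk. As a scalar model (again an illustrative C++ sketch with bw = 64 and hypothetical names, not the patch code):

```cpp
#include <cstdint>

// fshl: z == 0 ? x : (x << (z % bw) | y >> (bw - (z % bw)))
static uint64_t fshl64_select(uint64_t X, uint64_t Y, uint64_t Z) {
  uint64_t Amt = Z & 63; // z % bw
  // The select guards Amt == 0, where (bw - Amt) would be an
  // out-of-range (full-width) shift.
  return Amt == 0 ? X : ((X << Amt) | (Y >> (64 - Amt)));
}

// fshr: z == 0 ? y : (x << (bw - (z % bw)) | y >> (z % bw))
static uint64_t fshr64_select(uint64_t X, uint64_t Y, uint64_t Z) {
  uint64_t Amt = Z & 63;
  return Amt == 0 ? Y : ((X << (64 - Amt)) | (Y >> Amt));
}
```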
https://github.com/llvm/llvm-project/pull/185615