[llvm] [X86] Enable i256 fshl/fshr lowering on avx512 targets (PR #185455)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Mar 9 09:44:47 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
<details>
<summary>Changes</summary>
If freely foldable to vector, concat the i256 halves as v4i64 -> v8i64 and use the existing i512 shift lowering
Part of #184828
---
Patch is 44.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/185455.diff
2 Files Affected:
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+33)
- (modified) llvm/test/CodeGen/X86/funnel-shift-i256.ll (+334-580)
``````````diff
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 4ce343922a02e..328e79eb07ca5 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -1891,6 +1891,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRL, MVT::i512, Custom);
setOperationAction(ISD::SHL, MVT::i512, Custom);
setOperationAction(ISD::SRA, MVT::i512, Custom);
+ setOperationAction(ISD::FSHR, MVT::i256, Custom);
+ setOperationAction(ISD::FSHL, MVT::i256, Custom);
setOperationAction(ISD::SELECT, MVT::i512, Custom);
for (MVT VT : { MVT::v16i1, MVT::v16i8 }) {
@@ -2953,6 +2955,8 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
case ISD::XOR:
case ISD::ADD:
case ISD::SUB:
+ case ISD::FSHL:
+ case ISD::FSHR:
return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
case ISD::SELECT:
@@ -34519,6 +34523,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getBitcast(VT, Res));
return;
}
+ case ISD::FSHL:
+ case ISD::FSHR: {
+ EVT VT = N->getValueType(0);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Amt = N->getOperand(2);
+ assert(Subtarget.useAVX512Regs() && "AVX512F required");
+ assert(VT == MVT::i256 && "Unexpected VT!");
+ if (!mayFoldIntoVector(Op0, DAG, Subtarget) ||
+ !mayFoldIntoVector(Op1, DAG, Subtarget))
+ return;
+
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ SDValue Res = DAG.getBitcast(
+ MVT::i512, concatSubVectors(DAG.getBitcast(MVT::v4i64, Op1),
+ DAG.getBitcast(MVT::v4i64, Op0), DAG, dl));
+ Amt = DAG.getNode(ISD::AND, dl, Amt.getValueType(), Amt,
+ DAG.getConstant(255, dl, Amt.getValueType()));
+ if (Opc == ISD::FSHL) {
+ Res = DAG.getNode(ISD::SHL, dl, MVT::i512, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res,
+ DAG.getShiftAmountConstant(256, MVT::i512, dl));
+ } else {
+ Res = DAG.getNode(ISD::SRL, dl, MVT::i512, Res, Amt);
+ }
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, MVT::i256, Res));
+ return;
+ }
case ISD::CTPOP: {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i256.ll b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
index 7b787ab6a7a59..549b6e3fc0dd9 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i256.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
@@ -466,112 +466,76 @@ define i256 @fshl_i256_load(ptr %p0, ptr %p1, i256 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i256_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq (%rsi), %r9
-; AVX512F-NEXT: movq 8(%rsi), %r8
-; AVX512F-NEXT: movq 24(%rdx), %r10
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq 8(%rdx), %rbx
-; AVX512F-NEXT: cmoveq %r10, %rbx
-; AVX512F-NEXT: movq 16(%rdx), %r11
-; AVX512F-NEXT: movq (%rdx), %rdx
-; AVX512F-NEXT: cmoveq %r11, %rdx
-; AVX512F-NEXT: cmoveq %r9, %r11
-; AVX512F-NEXT: cmoveq %r8, %r10
-; AVX512F-NEXT: cmoveq 16(%rsi), %r9
-; AVX512F-NEXT: cmoveq 24(%rsi), %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmovneq %r9, %r8
-; AVX512F-NEXT: cmovneq %r10, %r9
-; AVX512F-NEXT: cmovneq %r11, %r10
-; AVX512F-NEXT: cmoveq %rbx, %rdx
-; AVX512F-NEXT: cmovneq %rbx, %r11
-; AVX512F-NEXT: movq %r11, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq %r10, %rdx
-; AVX512F-NEXT: shldq %cl, %r11, %rdx
-; AVX512F-NEXT: movq %r9, %rdi
-; AVX512F-NEXT: shldq %cl, %r10, %rdi
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %r9, %r8
-; AVX512F-NEXT: movq %r8, 24(%rax)
-; AVX512F-NEXT: movq %rdi, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
-; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512F-NEXT: movzbl %cl, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpsllq %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i256_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq (%rsi), %rax
-; AVX512VL-NEXT: movq 8(%rsi), %r8
-; AVX512VL-NEXT: movq 16(%rdx), %r10
-; AVX512VL-NEXT: movq 24(%rdx), %r9
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq 8(%rdx), %r11
-; AVX512VL-NEXT: cmoveq %r9, %r11
-; AVX512VL-NEXT: movq (%rdx), %rdx
-; AVX512VL-NEXT: cmoveq %r10, %rdx
-; AVX512VL-NEXT: cmoveq %rax, %r10
-; AVX512VL-NEXT: cmoveq %r8, %r9
-; AVX512VL-NEXT: cmoveq 16(%rsi), %rax
-; AVX512VL-NEXT: cmoveq 24(%rsi), %r8
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmovneq %rax, %r8
-; AVX512VL-NEXT: cmovneq %r9, %rax
-; AVX512VL-NEXT: cmovneq %r10, %r9
-; AVX512VL-NEXT: cmoveq %r11, %rdx
-; AVX512VL-NEXT: cmovneq %r11, %r10
-; AVX512VL-NEXT: movq %r10, %rsi
-; AVX512VL-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VL-NEXT: movq %r9, %rdx
-; AVX512VL-NEXT: shldq %cl, %r10, %rdx
-; AVX512VL-NEXT: movq %rax, %r10
-; AVX512VL-NEXT: shldq %cl, %r9, %r10
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rax, %r8
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %r10, 16(%rdi)
-; AVX512VL-NEXT: movq %rdx, 8(%rdi)
-; AVX512VL-NEXT: movq %rsi, (%rdi)
+; AVX512VL-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: movzbl %cl, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm1
+; AVX512VL-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm0, %zmm3
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_i256_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: movq (%rsi), %rax
-; AVX512VBMI-NEXT: movq 8(%rsi), %r8
-; AVX512VBMI-NEXT: movq 16(%rdx), %r10
-; AVX512VBMI-NEXT: movq 24(%rdx), %r9
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq 8(%rdx), %r11
-; AVX512VBMI-NEXT: cmoveq %r9, %r11
-; AVX512VBMI-NEXT: movq (%rdx), %rdx
-; AVX512VBMI-NEXT: cmoveq %r10, %rdx
-; AVX512VBMI-NEXT: cmoveq %rax, %r10
-; AVX512VBMI-NEXT: cmoveq %r8, %r9
-; AVX512VBMI-NEXT: cmoveq 16(%rsi), %rax
-; AVX512VBMI-NEXT: cmoveq 24(%rsi), %r8
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmovneq %rax, %r8
-; AVX512VBMI-NEXT: cmovneq %r9, %rax
-; AVX512VBMI-NEXT: cmovneq %r10, %r9
-; AVX512VBMI-NEXT: cmoveq %r11, %rdx
-; AVX512VBMI-NEXT: cmovneq %r11, %r10
-; AVX512VBMI-NEXT: movq %r10, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: movq %r9, %rdx
-; AVX512VBMI-NEXT: shldq %cl, %r10, %rdx
-; AVX512VBMI-NEXT: movq %rax, %r10
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r10
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, (%rdi)
+; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512VBMI-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VBMI-NEXT: movzbl %cl, %ecx
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%a1 = load i256, ptr %p1
@@ -650,103 +614,72 @@ define i256 @fshr_i256_load(ptr %p0, ptr %p1, i256 %a2) nounwind {
;
; AVX512F-LABEL: fshr_i256_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq (%rsi), %r8
-; AVX512F-NEXT: movq 8(%rsi), %r10
-; AVX512F-NEXT: movq 16(%rdx), %r9
-; AVX512F-NEXT: testb %cl, %cl
-; AVX512F-NEXT: movq (%rdx), %rbx
-; AVX512F-NEXT: cmovsq %r9, %rbx
-; AVX512F-NEXT: movq 24(%rdx), %r11
-; AVX512F-NEXT: movq 8(%rdx), %rdx
-; AVX512F-NEXT: cmovsq %r11, %rdx
-; AVX512F-NEXT: cmovsq %r8, %r9
-; AVX512F-NEXT: cmovsq %r10, %r11
-; AVX512F-NEXT: cmovsq 16(%rsi), %r8
-; AVX512F-NEXT: cmovsq 24(%rsi), %r10
+; AVX512F-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512F-NEXT: movzbl %cl, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm1
+; AVX512F-NEXT: # kill: def $eax killed $eax killed $rax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpsrlq %xmm3, %ymm0, %ymm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
+; AVX512F-NEXT: vpaddq %ymm0, %ymm0, %ymm0
+; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpsllq %xmm1, %ymm0, %ymm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmoveq %r8, %r10
-; AVX512F-NEXT: cmoveq %r11, %r8
-; AVX512F-NEXT: cmoveq %r9, %r11
-; AVX512F-NEXT: cmoveq %rdx, %r9
-; AVX512F-NEXT: cmoveq %rbx, %rdx
-; AVX512F-NEXT: shrdq %cl, %r9, %rdx
-; AVX512F-NEXT: shrdq %cl, %r11, %r9
-; AVX512F-NEXT: shrdq %cl, %r8, %r11
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r10, %r8
-; AVX512F-NEXT: movq %r8, 24(%rdi)
-; AVX512F-NEXT: movq %r11, 16(%rdi)
-; AVX512F-NEXT: movq %r9, 8(%rdi)
-; AVX512F-NEXT: movq %rdx, (%rdi)
-; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_i256_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq (%rsi), %r8
-; AVX512VL-NEXT: movq 8(%rsi), %rax
-; AVX512VL-NEXT: movq 16(%rdx), %r9
-; AVX512VL-NEXT: movq 24(%rdx), %r10
-; AVX512VL-NEXT: testb %cl, %cl
-; AVX512VL-NEXT: movq (%rdx), %r11
-; AVX512VL-NEXT: cmovsq %r9, %r11
-; AVX512VL-NEXT: movq 8(%rdx), %rdx
-; AVX512VL-NEXT: cmovsq %r10, %rdx
-; AVX512VL-NEXT: cmovsq %r8, %r9
-; AVX512VL-NEXT: cmovsq %rax, %r10
-; AVX512VL-NEXT: cmovsq 16(%rsi), %r8
-; AVX512VL-NEXT: cmovsq 24(%rsi), %rax
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmoveq %r8, %rax
-; AVX512VL-NEXT: cmoveq %r10, %r8
-; AVX512VL-NEXT: cmoveq %r9, %r10
-; AVX512VL-NEXT: cmoveq %rdx, %r9
-; AVX512VL-NEXT: cmoveq %r11, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VL-NEXT: shrdq %cl, %r10, %r9
-; AVX512VL-NEXT: shrdq %cl, %r8, %r10
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %rax, %r8
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %r10, 16(%rdi)
-; AVX512VL-NEXT: movq %r9, 8(%rdi)
-; AVX512VL-NEXT: movq %rdx, (%rdi)
+; AVX512VL-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512VL-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VL-NEXT: movzbl %cl, %ecx
+; AVX512VL-NEXT: movl %ecx, %edx
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %esi
+; AVX512VL-NEXT: shlxl %edx, %esi, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT: vpand %xmm1, %xmm2, %xmm3
+; AVX512VL-NEXT: vpsrlq %xmm3, %ymm0, %ymm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm0 = zmm0[1,2,3,4,5,6,7,0]
+; AVX512VL-NEXT: vpaddq %ymm0, %ymm0, %ymm0
+; AVX512VL-NEXT: vpandn %xmm1, %xmm2, %xmm1
+; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_i256_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: movq (%rsi), %r8
-; AVX512VBMI-NEXT: movq 8(%rsi), %rax
-; AVX512VBMI-NEXT: movq 16(%rdx), %r9
-; AVX512VBMI-NEXT: movq 24(%rdx), %r10
-; AVX512VBMI-NEXT: testb %cl, %cl
-; AVX512VBMI-NEXT: movq (%rdx), %r11
-; AVX512VBMI-NEXT: cmovsq %r9, %r11
-; AVX512VBMI-NEXT: movq 8(%rdx), %rdx
-; AVX512VBMI-NEXT: cmovsq %r10, %rdx
-; AVX512VBMI-NEXT: cmovsq %r8, %r9
-; AVX512VBMI-NEXT: cmovsq %rax, %r10
-; AVX512VBMI-NEXT: cmovsq 16(%rsi), %r8
-; AVX512VBMI-NEXT: cmovsq 24(%rsi), %rax
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: cmoveq %r8, %rax
-; AVX512VBMI-NEXT: cmoveq %r10, %r8
-; AVX512VBMI-NEXT: cmoveq %r9, %r10
-; AVX512VBMI-NEXT: cmoveq %rdx, %r9
-; AVX512VBMI-NEXT: cmoveq %r11, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %r10
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 8(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, (%rdi)
+; AVX512VBMI-NEXT: vmovdqu (%rdx), %ymm0
+; AVX512VBMI-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
+; AVX512VBMI-NEXT: movzbl %cl, %ecx
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%a1 = load i256, ptr %p1
@@ -1235,132 +1168,75 @@ define i256 @fshl_i256_vector(<4 x i64> %v0, <4 x i64> %v1, i256 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i256_vector:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rsi, %rcx
-; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512F-NEXT: vmovq %xmm2, %rsi
-; AVX512F-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512F-NEXT: vmovq %xmm0, %r11
-; AVX512F-NEXT: vmovq %xmm1, %r9
-; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512F-NEXT: vpextrq $1, %xmm1, %rbx
-; AVX512F-NEXT: vmovq %xmm0, %r10
-; AVX512F-NEXT: vpextrq $1, %xmm0, %r8
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: cmoveq %r8, %rbx
-; AVX512F-NEXT: cmoveq %r10, %r9
-; AVX512F-NEXT: cmoveq %r11, %r10
-; AVX512F-NEXT: cmoveq %rax, %r8
-; AVX512F-NEXT: cmovneq %r11, %rsi
-; AVX512F-NEXT: cmovneq %rax, %rdx
+; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: cmovneq %rsi, %rdx
-; AVX512F-NEXT: cmovneq %r8, %rsi
-; AVX512F-NEXT: cmovneq %r10, %r8
-; AVX512F-NEXT: cmoveq %rbx, %r9
-; AVX512F-NEXT: cmovneq %rbx, %r10
-; AVX512F-NEXT: movq %r10, %rdi
-; AVX512F-NEXT: shldq %cl, %r9, %rdi
-; AVX512F-NEXT: movq %r8, %r9
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: movq %rsi, %r10
-; AVX512F-NEXT: shldq %cl, %r8, %r10
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rsi, %rdx
-; AVX512F-NEXT: movq %rdx, 24(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %r9, 8(%rax)
-; AVX512F-NEXT: movq %rdi, (%rax)
-; AVX512F-NEXT: popq %rbx
+; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512F-NEXT: movzbl %sil, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm1
+; AVX512F-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512F-NEXT: vpsllq %xmm3, %zmm0, %zmm3
+; AVX512F-NEXT: vextracti64x4 $1, %zmm3, %ymm3
+; AVX512F-NEXT: valignq {{.*#+}} zmm0 = zmm0[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm0, %zmm0
+; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT: vpsrlq %xmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vpor %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i256_vector:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rsi, %rcx
-; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT: vpextrq $1, %xmm2, %rdx
-; AVX512VL-NEXT: vmovq %xmm2, %rax
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %r10
-; AVX512VL-NEXT: vmovq %xmm0, %r11
-; AVX512VL-NEXT: vmovq %xmm1, %r8
-; AVX512VL-NEXT: vextracti128 $1, %ymm1, %xmm0
-; AVX512VL-NEXT: vpextrq $1, %xmm1, %rbx
-; AVX512VL-NEXT: vpextrq $1, %xmm0, %rsi
-; AVX512VL-NEXT: vmovq %xmm0, %r9
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: cmoveq %rsi, %rbx
-; AVX512VL-NEXT: cmoveq %r9, %r8
-; AVX512VL-NEXT: cmoveq %r11, %r9
-; AVX512VL-NEXT: cmoveq %r10, %rsi
-; AVX512VL-NEXT: cmovneq %r11, %rax
-; AVX512VL-NEXT: cmovneq %r10, %rdx
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: cmovneq %rax, %rdx
-; AVX512VL-NEXT: cmovneq %rsi, %rax
-; AVX512VL-NEXT: cmovneq %r9, %rsi
-; AVX512VL-NEXT: cmoveq %rbx, %r8
-; AVX512VL-NEXT: cmovneq %rbx, %r9
-; AVX512VL-NEXT: movq %r9, %r10
-; AVX512VL-NEXT: shldq %cl, %r8, %r10
-; AVX512VL-NEXT: movq %rsi, %r8
-; AVX512VL-NEX...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/185455
More information about the llvm-commits
mailing list