[llvm] 77d625f - [DAG] MergeInnerShuffle with BinOps - sometimes accept undef mask elements
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 1 06:37:37 PDT 2021
Author: Simon Pilgrim
Date: 2021-04-01T14:33:00+01:00
New Revision: 77d625f8d8aa08cd162d20af51c41776a5034705
URL: https://github.com/llvm/llvm-project/commit/77d625f8d8aa08cd162d20af51c41776a5034705
DIFF: https://github.com/llvm/llvm-project/commit/77d625f8d8aa08cd162d20af51c41776a5034705.diff
LOG: [DAG] MergeInnerShuffle with BinOps - sometimes accept undef mask elements
If the inner shuffle already contains undef elements, then accept them in the merged shuffle as well.
This helps some X86 HADD/SUB patterns where slow targets were still ending up with HADD/SUB because the (un)merged shuffles were stuck on either side of the ADD/SUB, meaning the total cost ended up much higher than the "2*shuffle+add" sequence that a slow target usually expands a HADD/SUB to.
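As a rough illustration of the relaxed acceptance rule, here is a standalone C++ sketch (not the actual DAGCombiner code; acceptMergedMask is a made-up name, and -1 stands in for an undef mask element, mirroring ISD shuffle mask conventions):

#include <algorithm>
#include <cassert>
#include <vector>

// Sketch of the relaxed rule: a merged shuffle mask may keep undef (-1)
// elements, but only if the inner shuffle's mask already had undef
// elements - otherwise the merge must not introduce new undefs.
static bool acceptMergedMask(const std::vector<int> &InnerMask,
                             const std::vector<int> &MergedMask) {
  bool InnerHasUndef = std::any_of(InnerMask.begin(), InnerMask.end(),
                                   [](int M) { return M < 0; });
  bool MergedHasUndef = std::any_of(MergedMask.begin(), MergedMask.end(),
                                    [](int M) { return M < 0; });
  return InnerHasUndef || !MergedHasUndef;
}

int main() {
  // Inner shuffle already had undef lanes, so undefs in the merged mask
  // are tolerated - this is the newly accepted case.
  assert(acceptMergedMask({-1, -1, 0, 2}, {-1, -1, 3, 2}));
  // Inner shuffle was fully defined; a merged mask that introduces undef
  // is still rejected, as before this patch.
  assert(!acceptMergedMask({0, 1, 2, 3}, {-1, 1, 2, 3}));
  return 0;
}

The first case corresponds to the new behaviour; the second shows the pre-existing rejection still applies when the merge would introduce undef elements into a previously fully-defined shuffle.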
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/X86/haddsub-undef.ll
llvm/test/CodeGen/X86/phaddsub.ll
llvm/test/CodeGen/X86/x86-interleaved-access.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e280f3dd37e9d..c626bdf809dab 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21285,11 +21285,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
SDValue Op1 = LeftOp ? Op10 : Op11;
if (Commute)
std::swap(Op0, Op1);
- return Op0.getOpcode() == ISD::VECTOR_SHUFFLE &&
- InnerN->isOnlyUserOf(Op0.getNode()) &&
- MergeInnerShuffle(Commute, SVN, cast<ShuffleVectorSDNode>(Op0),
- Op1, TLI, SV0, SV1, Mask) &&
- llvm::none_of(Mask, [](int M) { return M < 0; });
+ // Only accept the merged shuffle if we don't introduce undef elements,
+ // or the inner shuffle already contained undef elements.
+ auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(Op0);
+ return SVN0 && InnerN->isOnlyUserOf(SVN0) &&
+ MergeInnerShuffle(Commute, SVN, SVN0, Op1, TLI, SV0, SV1,
+ Mask) &&
+ (llvm::any_of(SVN0->getMask(), [](int M) { return M < 0; }) ||
+ llvm::none_of(Mask, [](int M) { return M < 0; }));
};
// Ensure we don't increase the number of shuffles - we must merge a
diff --git a/llvm/test/CodeGen/X86/haddsub-undef.ll b/llvm/test/CodeGen/X86/haddsub-undef.ll
index 1c06749440ee2..442342ef2cca8 100644
--- a/llvm/test/CodeGen/X86/haddsub-undef.ll
+++ b/llvm/test/CodeGen/X86/haddsub-undef.ll
@@ -506,17 +506,32 @@ define <4 x float> @add_ps_007(<4 x float> %x) {
}
define <4 x float> @add_ps_030(<4 x float> %x) {
-; SSE-LABEL: add_ps_030:
-; SSE: # %bb.0:
-; SSE-NEXT: haddps %xmm0, %xmm0
-; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSE-NEXT: retq
+; SSE-SLOW-LABEL: add_ps_030:
+; SSE-SLOW: # %bb.0:
+; SSE-SLOW-NEXT: movaps %xmm0, %xmm1
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm0[2,3]
+; SSE-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSE-SLOW-NEXT: addps %xmm1, %xmm0
+; SSE-SLOW-NEXT: retq
;
-; AVX-LABEL: add_ps_030:
-; AVX: # %bb.0:
-; AVX-NEXT: vhaddps %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT: retq
+; SSE-FAST-LABEL: add_ps_030:
+; SSE-FAST: # %bb.0:
+; SSE-FAST-NEXT: haddps %xmm0, %xmm0
+; SSE-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSE-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: add_ps_030:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: add_ps_030:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-FAST-NEXT: retq
%l = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x float> %x, <4 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = fadd <4 x float> %l, %r
diff --git a/llvm/test/CodeGen/X86/phaddsub.ll b/llvm/test/CodeGen/X86/phaddsub.ll
index d999f5089a21f..d7d8c5f8e67a5 100644
--- a/llvm/test/CodeGen/X86/phaddsub.ll
+++ b/llvm/test/CodeGen/X86/phaddsub.ll
@@ -341,17 +341,38 @@ define <4 x i32> @phaddd_single_source1(<4 x i32> %x) {
}
define <4 x i32> @phaddd_single_source2(<4 x i32> %x) {
-; SSSE3-LABEL: phaddd_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddd_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
;
-; AVX-LABEL: phaddd_single_source2:
-; AVX: # %bb.0:
-; AVX-NEXT: vphaddd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3]
-; AVX-NEXT: retq
+; SSSE3-FAST-LABEL: phaddd_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; SSSE3-FAST-NEXT: retq
+;
+; AVX-SLOW-LABEL: phaddd_single_source2:
+; AVX-SLOW: # %bb.0:
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX-SLOW-NEXT: retq
+;
+; AVX-FAST-LABEL: phaddd_single_source2:
+; AVX-FAST: # %bb.0:
+; AVX-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,2,3]
+; AVX-FAST-NEXT: retq
+;
+; AVX2-SHUF-LABEL: phaddd_single_source2:
+; AVX2-SHUF: # %bb.0:
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
+; AVX2-SHUF-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,0,2,3]
+; AVX2-SHUF-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+; AVX2-SHUF-NEXT: retq
%l = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 2>
%r = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 undef, i32 1, i32 3>
%add = add <4 x i32> %l, %r
@@ -483,31 +504,37 @@ define <8 x i16> @phaddw_single_source1(<8 x i16> %x) {
}
define <8 x i16> @phaddw_single_source2(<8 x i16> %x) {
-; SSSE3-LABEL: phaddw_single_source2:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: phaddw %xmm0, %xmm0
-; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
-; SSSE3-NEXT: retq
+; SSSE3-SLOW-LABEL: phaddw_single_source2:
+; SSSE3-SLOW: # %bb.0:
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; SSSE3-SLOW-NEXT: paddw %xmm1, %xmm0
+; SSSE3-SLOW-NEXT: retq
+;
+; SSSE3-FAST-LABEL: phaddw_single_source2:
+; SSSE3-FAST: # %bb.0:
+; SSSE3-FAST-NEXT: phaddw %xmm0, %xmm0
+; SSSE3-FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
+; SSSE3-FAST-NEXT: retq
;
; AVX-SLOW-LABEL: phaddw_single_source2:
; AVX-SLOW: # %bb.0:
-; AVX-SLOW-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX-SLOW-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX-SLOW-NEXT: retq
;
; AVX-FAST-LABEL: phaddw_single_source2:
; AVX-FAST: # %bb.0:
; AVX-FAST-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,6,7]
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2]
+; AVX-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,2,3,4,5,6,7]
; AVX-FAST-NEXT: retq
;
; AVX2-SHUF-LABEL: phaddw_single_source2:
; AVX2-SHUF: # %bb.0:
-; AVX2-SHUF-NEXT: vphaddw %xmm0, %xmm0, %xmm0
-; AVX2-SHUF-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,0,1,2,3,0,1,2,3,0,1,2,3,0,1]
+; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,1,2,3,4,5,6,7]
+; AVX2-SHUF-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
+; AVX2-SHUF-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX2-SHUF-NEXT: retq
%l = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 2, i32 4, i32 6>
%r = shufflevector <8 x i16> %x, <8 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 1, i32 3, i32 5, i32 7>
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 42808b3910e42..f99b065bc2ecc 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -385,83 +385,81 @@ ret void
define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-LABEL: interleaved_load_vf8_i8_stride4:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vmovdqa (%rdi), %xmm1
-; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vmovdqa (%rdi), %xmm0
+; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX1-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX1-NEXT: retq
;
; AVX2-LABEL: interleaved_load_vf8_i8_stride4:
; AVX2: # %bb.0:
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vmovdqa (%rdi), %xmm1
-; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2
-; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3
-; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm4
-; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm3
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm3
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX2-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm3[1],xmm1[2],xmm3[3],xmm1[4],xmm3[5],xmm1[6],xmm3[7]
+; AVX2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7]
+; AVX2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3
+; AVX2-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm4 = xmm1[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <3,u,1,u,7,u,5,u,1,u,5,u,0,u,1,u>
; AVX2-NEXT: vpshufb %xmm4, %xmm1, %xmm1
-; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX2-NEXT: vpaddb %xmm3, %xmm1, %xmm1
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX2-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpaddb %xmm3, %xmm0, %xmm0
+; AVX2-NEXT: vpmullw %xmm0, %xmm2, %xmm0
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX2-NEXT: retq
;
; AVX512-LABEL: interleaved_load_vf8_i8_stride4:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; AVX512-NEXT: vpmovdb %zmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vmovdqa (%rdi), %xmm2
-; AVX512-NEXT: vmovdqa 16(%rdi), %xmm3
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm1 = <2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm1, %xmm3, %xmm4
-; AVX512-NEXT: vpshufb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
-; AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = <3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3
-; AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
-; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
-; AVX512-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX512-NEXT: vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [1,0,3,2,33,32,35,34]
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm4
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm3
+; AVX512-NEXT: vpermi2w %zmm4, %zmm3, %zmm2
+; AVX512-NEXT: vprold $8, %zmm4, %zmm4
+; AVX512-NEXT: vprold $8, %zmm3, %zmm3
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
+; AVX512-NEXT: vpaddb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u>
+; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1
+; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
+; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; AVX512-NEXT: vpmovdw %zmm1, %ymm1
+; AVX512-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vpmullw %xmm2, %xmm0, %xmm0
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
More information about the llvm-commits mailing list