[llvm] r365050 - [X86][AVX] combineX86ShufflesRecursively - peek through extract_subvector
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 3 08:46:08 PDT 2019
Author: rksimon
Date: Wed Jul 3 08:46:08 2019
New Revision: 365050
URL: http://llvm.org/viewvc/llvm-project?rev=365050&view=rev
Log:
[X86][AVX] combineX86ShufflesRecursively - peek through extract_subvector
If we have more than 2 shuffle ops to combine, try to use combineX86ShuffleChainWithExtract to see if some are from the same super vector.
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/pr29112.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=365050&r1=365049&r2=365050&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Wed Jul 3 08:46:08 2019
@@ -32738,29 +32738,34 @@ static SDValue combineX86ShufflesRecursi
return Cst;
// We can only combine unary and binary shuffle mask cases.
- if (Ops.size() > 2)
- return SDValue();
+ if (Ops.size() <= 2) {
+ // Minor canonicalization of the accumulated shuffle mask to make it easier
+ // to match below. All this does is detect masks with sequential pairs of
+ // elements, and shrink them to the half-width mask. It does this in a loop
+ // so it will reduce the size of the mask to the minimal width mask which
+ // performs an equivalent shuffle.
+ SmallVector<int, 64> WidenedMask;
+ while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
+ Mask = std::move(WidenedMask);
+ }
- // Minor canonicalization of the accumulated shuffle mask to make it easier
- // to match below. All this does is detect masks with sequential pairs of
- // elements, and shrink them to the half-width mask. It does this in a loop
- // so it will reduce the size of the mask to the minimal width mask which
- // performs an equivalent shuffle.
- SmallVector<int, 64> WidenedMask;
- while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
- Mask = std::move(WidenedMask);
- }
+ // Canonicalization of binary shuffle masks to improve pattern matching by
+ // commuting the inputs.
+ if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+ std::swap(Ops[0], Ops[1]);
+ }
- // Canonicalization of binary shuffle masks to improve pattern matching by
- // commuting the inputs.
- if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
- ShuffleVectorSDNode::commuteMask(Mask);
- std::swap(Ops[0], Ops[1]);
+ // Finally, try to combine into a single shuffle instruction.
+ return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
+ AllowVariableMask, DAG, Subtarget);
}
- // Finally, try to combine into a single shuffle instruction.
- return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask,
- AllowVariableMask, DAG, Subtarget);
+ // If that failed and any input is extracted then try to combine as a
+ // shuffle with the larger type.
+ return combineX86ShuffleChainWithExtract(Ops, Root, Mask, Depth,
+ HasVariableMask, AllowVariableMask,
+ DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
Modified: llvm/trunk/test/CodeGen/X86/pr29112.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr29112.ll?rev=365050&r1=365049&r2=365050&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr29112.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pr29112.ll Wed Jul 3 08:46:08 2019
@@ -10,53 +10,52 @@ define <4 x float> @bar(<4 x float>* %a1
; CHECK: # %bb.0:
; CHECK-NEXT: subq $72, %rsp
; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: vmovaps %xmm1, %xmm8
-; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm0 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
-; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm0
-; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm4
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm10 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm10[0,1],xmm2[1],xmm10[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm4[0],xmm1[1],xmm4[2,3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm11 = xmm0[0,1],xmm2[1],xmm0[3]
-; CHECK-NEXT: vextractf32x4 $3, %zmm3, %xmm7
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm4[0],xmm7[0],xmm4[1],xmm7[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm11[0,1,2],xmm3[1]
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm12
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm5 = xmm1[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm7[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm13 = xmm7[0,1,2],xmm5[0]
-; CHECK-NEXT: vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm7 = xmm4[0],xmm1[2],zero,zero
-; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm14[0,1,2],xmm3[1]
-; CHECK-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm4 = xmm9[0,1],xmm2[1],xmm9[3]
+; CHECK-NEXT: vmovaps %xmm1, %xmm9
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm15 = [4,22,1,17,4,22,1,17,4,22,1,17,4,22,1,17]
+; CHECK-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm15
+; CHECK-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [4,30,1,22,4,30,1,22,4,30,1,22,4,30,1,22]
+; CHECK-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm10
+; CHECK-NEXT: vbroadcastsd {{.*#+}} zmm7 = [85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925,85899345925]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm7
+; CHECK-NEXT: vmovaps {{.*#+}} xmm8 = [4,28,1,29]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm8
+; CHECK-NEXT: vmovaps {{.*#+}} xmm4 = [4,21,1,7]
+; CHECK-NEXT: vpermi2ps %zmm3, %zmm2, %zmm4
+; CHECK-NEXT: vextractf128 $1, %ymm3, %xmm5
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm6
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm11 = xmm6[0],xmm5[0],xmm6[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm11[0,1],xmm2[1],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1]
; CHECK-NEXT: vinsertps {{.*#+}} xmm6 = xmm4[0,1,2],xmm3[1]
; CHECK-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm10[0,1],xmm2[3],xmm10[3]
; CHECK-NEXT: vextractf32x4 $2, %zmm3, %xmm4
-; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm14[0,1,2],xmm4[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm4 = xmm1[0,1,2],xmm4[3]
+; CHECK-NEXT: vpermilps {{.*#+}} xmm0 = xmm2[3,1,2,3]
+; CHECK-NEXT: vunpcklps {{.*#+}} xmm5 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
; CHECK-NEXT: vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
-; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm11[0,1,2],xmm3[3]
-; CHECK-NEXT: vblendps {{.*#+}} xmm9 = xmm14[0,1,2],xmm3[3]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[1]
-; CHECK-NEXT: vpermilpd {{.*#+}} xmm3 = xmm3[1,0]
-; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0]
-; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm2
-; CHECK-NEXT: vaddps %xmm9, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm13, %xmm1, %xmm9
-; CHECK-NEXT: vaddps %xmm12, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm3
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm7[0,1],xmm2[1],xmm7[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm7 = xmm0[0,1,2],xmm3[3]
+; CHECK-NEXT: vblendps {{.*#+}} xmm12 = xmm1[0,1,2],xmm3[3]
+; CHECK-NEXT: vpermilpd {{.*#+}} xmm13 = xmm3[1,0]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0,1,2],xmm3[1]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm3[1]
+; CHECK-NEXT: vaddps %xmm1, %xmm3, %xmm8
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm11[0,1],xmm2[3],xmm11[3]
+; CHECK-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm13[0]
+; CHECK-NEXT: vaddps %xmm15, %xmm2, %xmm2
+; CHECK-NEXT: vmovaps %xmm14, %xmm1
+; CHECK-NEXT: vmovaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT: vaddps %xmm10, %xmm14, %xmm10
+; CHECK-NEXT: vaddps %xmm14, %xmm14, %xmm3
+; CHECK-NEXT: vaddps %xmm12, %xmm15, %xmm0
+; CHECK-NEXT: vaddps %xmm8, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm14, %xmm0
; CHECK-NEXT: vmovaps %xmm3, {{[0-9]+}}(%rsp)
-; CHECK-NEXT: vmovaps %xmm9, (%rsp)
-; CHECK-NEXT: vmovaps %xmm8, %xmm3
+; CHECK-NEXT: vmovaps %xmm10, (%rsp)
+; CHECK-NEXT: vmovaps %xmm9, %xmm3
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: callq foo
; CHECK-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll?rev=365050&r1=365049&r2=365050&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512-widen.ll Wed Jul 3 08:46:08 2019
@@ -716,11 +716,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
@@ -801,11 +799,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll?rev=365050&r1=365049&r2=365050&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll (original)
+++ llvm/trunk/test/CodeGen/X86/shuffle-vs-trunc-512.ll Wed Jul 3 08:46:08 2019
@@ -707,11 +707,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321,4411615795313452321]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,61]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
@@ -792,11 +790,9 @@ define <16 x i8> @trunc_shuffle_v64i8_01
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm1 = [2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785,2096730264494998785]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm1
-; AVX512VBMI-NEXT: vpbroadcastq {{.*#+}} zmm2 = [4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257,4483673389351380257]
-; AVX512VBMI-NEXT: vpermb %zmm0, %zmm2, %zmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
+; AVX512VBMI-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
;
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll?rev=365050&r1=365049&r2=365050&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-512-v16.ll Wed Jul 3 08:46:08 2019
@@ -322,12 +322,9 @@ define <4 x i32> @test_v16i32_0_1_2_12 (
define <4 x i32> @test_v16i32_0_4_8_12(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_0_4_8_12:
; ALL: # %bb.0:
-; ALL-NEXT: vextractf128 $1, %ymm0, %xmm1
-; ALL-NEXT: vunpcklps {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [17179869184,17179869184,17179869184,17179869184]
-; ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
-; ALL-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
+; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [0,4,8,12]
+; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0
+; ALL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; ALL-NEXT: vzeroupper
; ALL-NEXT: retq
%res = shufflevector <16 x i32> %v, <16 x i32> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
More information about the llvm-commits
mailing list