[llvm] r313373 - [X86] Prefer VPERMQ over VPERM2F128 for any unary shuffle, not just the ones that can be done with a insertf128
Craig Topper via llvm-commits
llvm-commits at lists.llvm.org
Fri Sep 15 11:11:13 PDT 2017
Author: ctopper
Date: Fri Sep 15 11:11:13 2017
New Revision: 313373
URL: http://llvm.org/viewvc/llvm-project?rev=313373&view=rev
Log:
[X86] Prefer VPERMQ over VPERM2F128 for any unary shuffle, not just the ones that can be done with a insertf128
The early out for AVX2 in lowerV2X128VectorShuffle is positioned in a weird spot below some shuffle mask equivalency checks.
But I think we want to allow VPERMQ for any unary shuffle.
Differential Revision: https://reviews.llvm.org/D37893
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=313373&r1=313372&r2=313373&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Fri Sep 15 11:11:13 2017
@@ -12151,6 +12151,10 @@ static SDValue lowerV2X128VectorShuffle(
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
@@ -12174,9 +12178,6 @@ static SDValue lowerV2X128VectorShuffle(
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
- if (Subtarget.hasAVX2() && V2.isUndef())
- return SDValue();
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
Modified: llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll?rev=313373&r1=313372&r2=313373&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx-vperm2x128.ll Fri Sep 15 11:11:13 2017
@@ -228,10 +228,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_uu67uu67(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu67uu67:
-; ALL: # BB#0: # %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu67uu67:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu67uu67:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 undef, i32 undef, i32 6, i32 7>
ret <8 x float> %shuffle
@@ -258,10 +263,15 @@ entry:
}
define <8 x float> @shuffle_v8f32_uu674567(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp {
-; ALL-LABEL: shuffle_v8f32_uu674567:
-; ALL: # BB#0: # %entry
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1-LABEL: shuffle_v8f32_uu674567:
+; AVX1: # BB#0: # %entry
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v8f32_uu674567:
+; AVX2: # BB#0: # %entry
+; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,3,2,3]
+; AVX2-NEXT: retq
entry:
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 undef, i32 undef, i32 6, i32 7, i32 4, i32 5, i32 6, i32 7>
ret <8 x float> %shuffle
Modified: llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll?rev=313373&r1=313372&r2=313373&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll (original)
+++ llvm/trunk/test/CodeGen/X86/avx512-shuffles/partial_permute.ll Fri Sep 15 11:11:13 2017
@@ -2115,10 +2115,10 @@ define <4 x i64> @test_masked_8xi64_to_4
; CHECK: # BB#0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermq {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> %vec2
@@ -2130,10 +2130,9 @@ define <4 x i64> @test_masked_z_8xi64_to
; CHECK: # BB#0:
; CHECK-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; CHECK-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $2, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovdqa64 %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x i64> %vec, <8 x i64> undef, <4 x i32> <i32 6, i32 3, i32 6, i32 3>
%res = select <4 x i1> <i1 0, i1 1, i1 0, i1 0>, <4 x i64> %shuf, <4 x i64> zeroinitializer
@@ -3849,7 +3848,7 @@ define <4 x double> @test_8xdouble_to_4x
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%res = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
ret <4 x double> %res
@@ -3859,10 +3858,10 @@ define <4 x double> @test_masked_8xdoubl
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm2[1],ymm0[1],ymm2[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm1 {%k1} = ymm0[2,3,2,3]
+; CHECK-NEXT: vmovapd %ymm1, %ymm0
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> %vec2
@@ -3874,10 +3873,9 @@ define <4 x double> @test_masked_z_8xdou
; CHECK: # BB#0:
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; CHECK-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm1[1],ymm0[1],ymm1[3],ymm0[3]
-; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; CHECK-NEXT: movb $4, %al
; CHECK-NEXT: kmovd %eax, %k1
-; CHECK-NEXT: vmovapd %ymm0, %ymm0 {%k1} {z}
+; CHECK-NEXT: vpermpd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,2,3]
; CHECK-NEXT: retq
%shuf = shufflevector <8 x double> %vec, <8 x double> undef, <4 x i32> <i32 7, i32 3, i32 7, i32 3>
%res = select <4 x i1> <i1 0, i1 0, i1 1, i1 0>, <4 x double> %shuf, <4 x double> zeroinitializer
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll?rev=313373&r1=313372&r2=313373&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll Fri Sep 15 11:11:13 2017
@@ -722,21 +722,33 @@ define <8 x float> @shuffle_v8f32_321032
}
define <8 x float> @shuffle_v8f32_76547654(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76547654:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_76547654:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8f32_76547654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x float> %shuffle
}
define <8 x float> @shuffle_v8f32_76543210(<8 x float> %a, <8 x float> %b) {
-; ALL-LABEL: shuffle_v8f32_76543210:
-; ALL: # BB#0:
-; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; ALL-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; ALL-NEXT: retq
+; AVX1OR2-LABEL: shuffle_v8f32_76543210:
+; AVX1OR2: # BB#0:
+; AVX1OR2-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX1OR2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8f32_76543210:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x float> %shuffle
}
@@ -1809,11 +1821,17 @@ define <8 x i32> @shuffle_v8i32_76547654
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76547654:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_76547654:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_76547654:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 7, i32 6, i32 5, i32 4>
ret <8 x i32> %shuffle
}
@@ -1825,11 +1843,17 @@ define <8 x i32> @shuffle_v8i32_76543210
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
; AVX1-NEXT: retq
;
-; AVX2OR512VL-LABEL: shuffle_v8i32_76543210:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
-; AVX2OR512VL-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
-; AVX2OR512VL-NEXT: retq
+; AVX2-LABEL: shuffle_v8i32_76543210:
+; AVX2: # BB#0:
+; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX2-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX2-NEXT: retq
+;
+; AVX512VL-LABEL: shuffle_v8i32_76543210:
+; AVX512VL: # BB#0:
+; AVX512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[3,2,1,0,7,6,5,4]
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1]
+; AVX512VL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
ret <8 x i32> %shuffle
}
More information about the llvm-commits
mailing list