[llvm] r289846 - [x86] use a single shufps for 256-bit vectors when it can save instructions
Sanjay Patel via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 15 10:43:46 PST 2016
Author: spatel
Date: Thu Dec 15 12:43:46 2016
New Revision: 289846
URL: http://llvm.org/viewvc/llvm-project?rev=289846&view=rev
Log:
[x86] use a single shufps for 256-bit vectors when it can save instructions
This is the 256-bit counterpart to the 128-bit transform checked in here:
https://reviews.llvm.org/rL289837
This patch is based on the draft by @sroland (Roland Scheidegger) that is
attached to PR27885:
https://llvm.org/bugs/show_bug.cgi?id=27885
Modified:
llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
llvm/trunk/test/CodeGen/X86/pmul.ll
llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
Modified: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86ISelLowering.cpp?rev=289846&r1=289845&r2=289846&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp Thu Dec 15 12:43:46 2016
@@ -12232,7 +12232,9 @@ static SDValue lowerV8I32VectorShuffle(c
// efficient instructions that mirror the shuffles across the two 128-bit
// lanes.
SmallVector<int, 4> RepeatedMask;
- if (is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask)) {
+ bool Is128BitLaneRepeatedShuffle =
+ is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
+ if (Is128BitLaneRepeatedShuffle) {
assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
if (V2.isUndef())
return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
@@ -12273,6 +12275,16 @@ static SDValue lowerV8I32VectorShuffle(c
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
+ // Assume that a single SHUFPS is faster than an alternative sequence of
+ // multiple instructions (even if the CPU has a domain penalty).
+ // If some CPU is harmed by the domain switch, we can fix it in a later pass.
+ if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
+ SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
+ SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
+ SDValue ShufPS = DAG.getVectorShuffle(MVT::v8f32, DL, CastV1, CastV2, Mask);
+ return DAG.getBitcast(MVT::v8i32, ShufPS);
+ }
+
// Try to simplify this by merging 128-bit lanes to enable a lane-based
// shuffle.
if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
Modified: llvm/trunk/test/CodeGen/X86/pmul.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pmul.ll?rev=289846&r1=289845&r2=289846&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pmul.ll (original)
+++ llvm/trunk/test/CodeGen/X86/pmul.ll Thu Dec 15 12:43:46 2016
@@ -1390,17 +1390,15 @@ define <8 x i32> @mul_v8i64_zero_upper(<
;
; AVX2-LABEL: mul_v8i64_zero_upper:
; AVX2: # BB#0: # %entry
-; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1
; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT: retq
;
@@ -1410,9 +1408,7 @@ define <8 x i32> @mul_v8i64_zero_upper(<
; AVX512-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; AVX512-NEXT: vpmuludq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
-; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,1,3,4,5,5,7]
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,3,2,3,5,7,6,7]
-; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512-NEXT: retq
entry:
Modified: llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll?rev=289846&r1=289845&r2=289846&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll (original)
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-256-v8.ll Thu Dec 15 12:43:46 2016
@@ -1223,47 +1223,28 @@ define <8 x i32> @shuffle_v8i32_08084c4c
}
define <8 x i32> @shuffle_v8i32_8823cc67(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_8823cc67:
-; AVX1: # BB#0:
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_8823cc67:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,2,3,4,4,6,7]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_8823cc67:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[0,0],ymm0[2,3],ymm1[4,4],ymm0[6,7]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 8, i32 8, i32 2, i32 3, i32 12, i32 12, i32 6, i32 7>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_9832dc76(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_9832dc76:
-; AVX1: # BB#0:
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_9832dc76:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,0,3,2,5,4,7,6]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_9832dc76:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[3,2],ymm1[5,4],ymm0[7,6]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 3, i32 2, i32 13, i32 12, i32 7, i32 6>
ret <8 x i32> %shuffle
}
define <8 x i32> @shuffle_v8i32_9810dc54(<8 x i32> %a, <8 x i32> %b) {
-; AVX1-LABEL: shuffle_v8i32_9810dc54:
-; AVX1: # BB#0:
-; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
-; AVX1-NEXT: retq
-;
-; AVX2OR512VL-LABEL: shuffle_v8i32_9810dc54:
-; AVX2OR512VL: # BB#0:
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,1,0,4,5,5,4]
-; AVX2OR512VL-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,0,2,3,5,4,6,7]
-; AVX2OR512VL-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
-; AVX2OR512VL-NEXT: retq
+; ALL-LABEL: shuffle_v8i32_9810dc54:
+; ALL: # BB#0:
+; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,0],ymm0[1,0],ymm1[5,4],ymm0[5,4]
+; ALL-NEXT: retq
%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 9, i32 8, i32 1, i32 0, i32 13, i32 12, i32 5, i32 4>
ret <8 x i32> %shuffle
}
More information about the llvm-commits
mailing list