[llvm] 57a551a - [X86][AVX] lowerShuffleAsLanePermuteAndShuffle - don't split element rotate patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 13 04:05:20 PST 2022
Author: Simon Pilgrim
Date: 2022-01-13T11:59:08Z
New Revision: 57a551a8dfa7560f4b838f8b96980852af2f73d5
URL: https://github.com/llvm/llvm-project/commit/57a551a8dfa7560f4b838f8b96980852af2f73d5
DIFF: https://github.com/llvm/llvm-project/commit/57a551a8dfa7560f4b838f8b96980852af2f73d5.diff
LOG: [X86][AVX] lowerShuffleAsLanePermuteAndShuffle - don't split element rotate patterns
Partial element rotate patterns (e.g. for element insertion from Issue #53124) were being split whenever not every lane contained a crossing element, but there is actually a good repeated per-lane mask hiding in these patterns.
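For context, matchShuffleAsElementRotate roughly looks for masks where every defined element is consistent with reading the two concatenated sources at a fixed element offset; "partial" means some elements are undef (filled later, e.g. by a blend), which previously let the per-lane split heuristic below fire. The standalone sketch that follows is a simplified, hypothetical restatement of that consistency check (it is not the actual LLVM implementation and omits the source-tracking the real matcher performs); the example masks are purely illustrative:

  #include <cstdio>
  #include <vector>

  // Simplified sketch of an element-rotate check (hypothetical helper, not
  // the real matchShuffleAsElementRotate): return a rotation amount R with
  // 0 < R < NumElts if every defined mask element is consistent with
  // result[i] == Concat(Lo, Hi)[i + R], treating -1 as undef. The real
  // matcher additionally tracks which input feeds Lo and which feeds Hi.
  static int matchElementRotateSketch(const std::vector<int> &Mask) {
    int NumElts = (int)Mask.size();
    int Rotation = 0;
    for (int i = 0; i < NumElts; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // undef elements are compatible with any rotation
      // Offset at which a rotated source would have to start in order to
      // place this element at position i.
      int StartIdx = i - (M % NumElts);
      if (StartIdx == 0)
        return -1; // in-place element: identity, not a useful rotation
      int Candidate = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
      if (Rotation == 0)
        Rotation = Candidate;
      else if (Rotation != Candidate)
        return -1; // elements imply different rotation amounts
    }
    return Rotation == 0 ? -1 : Rotation;
  }

  int main() {
    // Partial rotate: only the tail elements are demanded; the undef slots
    // are expected to be filled by a later blend/insertion.
    std::vector<int> Partial = {-1, 0, 1, 2, 3, 4, 5, 6};
    // Full rotate of the concatenated sources by three elements.
    std::vector<int> Full = {3, 4, 5, 6, 7, 8, 9, 10};
    std::printf("partial rotate amount: %d\n", matchElementRotateSketch(Partial)); // 7
    std::printf("full rotate amount:    %d\n", matchElementRotateSketch(Full));    // 3
    return 0;
  }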
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b95624e2f6550..80d3ed06437e4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -16544,23 +16544,30 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
return V;
+  // Always allow ElementRotate patterns - these are sometimes hidden but it's
+  // still better to avoid splitting.
+ SDValue RotV1 = V1, RotV2 = V2;
+ bool IsElementRotate = 0 <= matchShuffleAsElementRotate(RotV1, RotV2, Mask);
+
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
- if (!Subtarget.hasAVX2()) {
- bool LaneCrossing[2] = {false, false};
- for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
- LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
- if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
- } else {
- bool LaneUsed[2] = {false, false};
- for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0)
- LaneUsed[(Mask[i] % Size) / LaneSize] = true;
- if (!LaneUsed[0] || !LaneUsed[1])
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ if (!IsElementRotate) {
+ if (!Subtarget.hasAVX2()) {
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ } else {
+ bool LaneUsed[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneUsed[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneUsed[0] || !LaneUsed[1])
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
}
// TODO - we could support shuffling V2 in the Flipped input.
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
index 34ff47edf18aa..2e29d18067209 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -1345,11 +1345,9 @@ define <8 x float> @shuffle_v8f32_01452367(<8 x float> %a) {
define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_089abcde:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,1,2]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,0,1,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
@@ -1402,11 +1400,9 @@ define <8 x float> @shuffle_v8f32_0189abcd(<8 x float> %a, <8 x float> %b) {
define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) {
; AVX1-LABEL: shuffle_v8f32_01289abc:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[0,0,0,0]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3]
-; AVX1-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[1,2,3,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX1-NEXT: retq
;
@@ -2880,10 +2876,9 @@ define <8 x i32> @shuffle_v8i32_ba983210(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_089abcde:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10,11]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,2]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,0],ymm1[0,0],ymm2[7,4],ymm1[4,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,2],ymm1[1,2],ymm2[4,6],ymm1[5,6]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
@@ -2954,10 +2949,9 @@ define <8 x i32> @shuffle_v8i32_0189abcd(<8 x i32> %a, <8 x i32> %b) {
define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) {
; AVX1-LABEL: shuffle_v8i32_01289abc:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm1[2,3,0,1]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm1[0,0],ymm2[3,0],ymm1[4,4],ymm2[7,4]
+; AVX1-NEXT: vshufps {{.*#+}} ymm1 = ymm2[1,2],ymm1[2,0],ymm2[5,6],ymm1[6,4]
; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7]
; AVX1-NEXT: retq
;