[llvm] 3079e51 - [X86][SSE] Generalize shuffle(HORIZOP,HORIZOP) -> HORIZOP combine
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Sun Apr 5 04:10:18 PDT 2020
Author: Simon Pilgrim
Date: 2020-04-05T12:09:19+01:00
New Revision: 3079e518589ca9a1b1b2eb20bd8abb80c32ba5ec
URL: https://github.com/llvm/llvm-project/commit/3079e518589ca9a1b1b2eb20bd8abb80c32ba5ec
DIFF: https://github.com/llvm/llvm-project/commit/3079e518589ca9a1b1b2eb20bd8abb80c32ba5ec.diff
LOG: [X86][SSE] Generalize shuffle(HORIZOP,HORIZOP) -> HORIZOP combine
Our existing combine allows us to merge a shuffle of 2 similar 64-bit wide 'horizontal ops' (HADD/PACK/etc.) if the shuffle was an UNPCK/MOVSD.
This patch generalizes the combine to decode any target shuffle mask that can be widened to a 128-bit repeating v2*64 mask, which helps us catch PBLENDW/PBLENDD cases.
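As a rough illustration of the new matching (not part of this commit), the short standalone C++17 sketch below mimics, in simplified form, the repeated pairwise widening that canWidenShuffleElements performs: a PBLENDW-style v8i16 mask that repeats across 128-bit lanes collapses to a two-element v2*64-style mask whose entries then name the lower/upper horizontal-op inputs directly. The helper widenOnce is hypothetical, and undef mask elements are ignored for brevity.

// Sketch only (plain C++17, not the in-tree helpers).
#include <cstddef>
#include <cstdio>
#include <optional>
#include <vector>

// A pair of adjacent mask elements (2k, 2k+1) widens to the single element k.
static std::optional<std::vector<int>> widenOnce(const std::vector<int> &Mask) {
  std::vector<int> Widened;
  for (std::size_t I = 0; I + 1 < Mask.size(); I += 2) {
    int M0 = Mask[I], M1 = Mask[I + 1];
    if (M0 < 0 || (M0 % 2) != 0 || M1 != M0 + 1)
      return std::nullopt; // pair does not describe one whole wider element
    Widened.push_back(M0 / 2);
  }
  return Widened;
}

int main() {
  // v8i16 PBLENDW mask taking the low half of input 0 and the high half of
  // input 1 (input 1's elements are offset by 8).
  std::vector<int> Mask = {0, 1, 2, 3, 12, 13, 14, 15};
  while (Mask.size() > 2) {
    auto Widened = widenOnce(Mask);
    if (!Widened)
      break;
    Mask = *Widened;
  }
  // Prints "0 3": the lower half comes from LHS half 0 and the upper half
  // from RHS half 1, so shuffle(HOP(a,b), HOP(c,d)) can fold to HOP(a,d).
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}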
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
llvm/test/CodeGen/X86/vector-trunc.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 59b5700d4804..5ea4cfaddb92 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5429,6 +5429,12 @@ static bool isInRange(int Val, int Low, int Hi) {
return (Val >= Low && Val < Hi);
}
+/// Return true if every element value in Mask falls within the specified
+/// range [Low, Hi).
+static bool isInRange(ArrayRef<int> Mask, int Low, int Hi) {
+ return llvm::all_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
+}
+
/// Return true if the value of any element in Mask falls within the specified
/// range [Low, Hi).
static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
@@ -35358,31 +35364,43 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
+ bool IsUnary;
+ SmallVector<int, 64> TargetMask;
+ SmallVector<SDValue, 2> TargetOps;
+ if (isTargetShuffle(Opcode))
+ getTargetShuffleMask(N.getNode(), VT, false, TargetOps, TargetMask, IsUnary);
+
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction.
- if (VT.getScalarSizeInBits() == 64 &&
- (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
- Opcode == X86ISD::UNPCKL)) {
- auto BC0 = peekThroughBitcasts(N.getOperand(0));
- auto BC1 = peekThroughBitcasts(N.getOperand(1));
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo, Hi;
- if (Opcode == X86ISD::MOVSD) {
- Lo = BC1.getOperand(0);
- Hi = BC0.getOperand(1);
- } else {
- Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
- Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ SmallVector<int, 16> TargetMask128;
+ if (!TargetMask.empty() && TargetOps.size() == 2 &&
+ is128BitLaneRepeatedShuffleMask(VT, TargetMask, TargetMask128)) {
+ SmallVector<int, 16> WidenedMask128 = TargetMask128;
+ while (WidenedMask128.size() > 2) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
+ break;
+ WidenedMask128 = std::move(WidenedMask);
+ }
+ if (WidenedMask128.size() == 2 && isInRange(WidenedMask128, 0, 4)) {
+ SDValue BC0 = peekThroughBitcasts(TargetOps[0]);
+ SDValue BC1 = peekThroughBitcasts(TargetOps[1]);
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
+ Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
}
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
}
}
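To see how the rewritten block above chooses its operands, here is a hedged walk-through of one hypothetical case (values picked for illustration, roughly mirroring the vpblendd patterns removed in the tests below):

// Hypothetical input: N = blend(BC0, BC1) with v4i32 mask {0, 1, 6, 7},
// where BC0 = PACKUS(a, b) and BC1 = PACKUS(c, d).
//
// TargetMask128           = {0, 1, 6, 7}  // one repeating 128-bit lane
// widened once            = {0, 3}        // two v2*64-style elements
// WidenedMask128[0] == 0  -> in [0,2)     -> Lo = BC0.getOperand(0 & 1) = a
// WidenedMask128[1] == 3  -> not in [0,2) -> Hi = BC1.getOperand(3 & 1) = d
// Result: a single PACKUS(a, d) node replaces the blend of the two PACKUS
// results, which matches the shape of the test improvements below.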
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
index 588eddc3c93d..798c95d9c1e8 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-512.ll
@@ -15,21 +15,17 @@ define i8 @v8i64(<8 x i64> %a, <8 x i64> %b, <8 x i64> %c, <8 x i64> %d) {
; SSE-NEXT: pcmpgtq %xmm7, %xmm3
; SSE-NEXT: pcmpgtq %xmm6, %xmm2
; SSE-NEXT: packssdw %xmm3, %xmm2
-; SSE-NEXT: packssdw %xmm2, %xmm2
; SSE-NEXT: pcmpgtq %xmm5, %xmm1
; SSE-NEXT: pcmpgtq %xmm4, %xmm0
; SSE-NEXT: packssdw %xmm1, %xmm0
-; SSE-NEXT: packssdw %xmm0, %xmm0
-; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: packssdw %xmm2, %xmm0
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: packssdw %xmm11, %xmm10
-; SSE-NEXT: packssdw %xmm10, %xmm1
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: pcmpgtq {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: packssdw %xmm9, %xmm8
-; SSE-NEXT: packssdw %xmm0, %xmm8
-; SSE-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm1[4,5,6,7]
+; SSE-NEXT: packssdw %xmm10, %xmm8
; SSE-NEXT: pand %xmm0, %xmm8
; SSE-NEXT: packsswb %xmm0, %xmm8
; SSE-NEXT: pmovmskb %xmm8, %eax
@@ -115,21 +111,17 @@ define i8 @v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c, <8 x double>
; SSE-NEXT: cmpltpd %xmm3, %xmm7
; SSE-NEXT: cmpltpd %xmm2, %xmm6
; SSE-NEXT: packssdw %xmm7, %xmm6
-; SSE-NEXT: packssdw %xmm6, %xmm2
; SSE-NEXT: cmpltpd %xmm1, %xmm5
; SSE-NEXT: cmpltpd %xmm0, %xmm4
; SSE-NEXT: packssdw %xmm5, %xmm4
-; SSE-NEXT: packssdw %xmm0, %xmm4
-; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[4,5,6,7]
+; SSE-NEXT: packssdw %xmm6, %xmm4
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm11
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm10
; SSE-NEXT: packssdw %xmm11, %xmm10
-; SSE-NEXT: packssdw %xmm10, %xmm0
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm9
; SSE-NEXT: cmpltpd {{[0-9]+}}(%rsp), %xmm8
; SSE-NEXT: packssdw %xmm9, %xmm8
-; SSE-NEXT: packssdw %xmm0, %xmm8
-; SSE-NEXT: pblendw {{.*#+}} xmm8 = xmm8[0,1,2,3],xmm0[4,5,6,7]
+; SSE-NEXT: packssdw %xmm10, %xmm8
; SSE-NEXT: pand %xmm4, %xmm8
; SSE-NEXT: packsswb %xmm0, %xmm8
; SSE-NEXT: pmovmskb %xmm8, %eax
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index d04d20109506..40c7ee9724ea 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -295,12 +295,10 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512F-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm1
; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512F-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX512F-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512F-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT: retq
;
@@ -328,12 +326,10 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm1
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512BW-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX512BW-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512BW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512BW-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT: retq
;
@@ -352,12 +348,10 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm1 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm2, %xmm1
-; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm1
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm2 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3],mem[4],xmm0[5,6,7]
; AVX512VBMI-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX512VBMI-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VBMI-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX512VBMI-NEXT: vmovdqa %xmm0, (%rsi)
; AVX512VBMI-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
index 2ab4a7cb5724..28dddd759930 100644
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -1398,12 +1398,10 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],xmm4[1,2,3],xmm3[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3],xmm2[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm3, %xmm2
-; SSE41-NEXT: packusdw %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1,2,3],xmm1[4],xmm4[5,6,7]
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm4[1,2,3],xmm0[4],xmm4[5,6,7]
; SSE41-NEXT: packusdw %xmm1, %xmm0
-; SSE41-NEXT: packusdw %xmm0, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
+; SSE41-NEXT: packusdw %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: trunc2x4i64_8i16:
@@ -1413,13 +1411,11 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
+; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
@@ -1430,13 +1426,11 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3],xmm1[4],xmm3[5,6,7]
; AVX2-NEXT: vpackusdw %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1,2,3],xmm2[4],xmm3[5,6,7]
; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3],xmm0[4],xmm3[5,6,7]
; AVX2-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;