[llvm] 03482bc - [X86] collectConcatOps - add ability to collect from vector 'widening' patterns
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 27 07:39:10 PDT 2022
Author: Simon Pilgrim
Date: 2022-04-27T15:38:58+01:00
New Revision: 03482bccad33274c74b385fc139bbb25be156543
URL: https://github.com/llvm/llvm-project/commit/03482bccad33274c74b385fc139bbb25be156543
DIFF: https://github.com/llvm/llvm-project/commit/03482bccad33274c74b385fc139bbb25be156543.diff
LOG: [X86] collectConcatOps - add ability to collect from vector 'widening' patterns
Recognise insert_subvector(undef, x, lo/hi) patterns where we double the width of a vector, synthesising the missing UNDEF subvector on the fly - which is why collectConcatOps now takes a SelectionDAG reference to create it.
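For illustration: the new cases treat a vector that was 'widened' via insert_subvector(undef, x, lo/hi) as a two-element concatenation whose missing half is an explicit UNDEF subvector. Below is a minimal standalone C++ sketch of that idea - MiniNode and collectHalves are hypothetical stand-ins for SelectionDAG machinery, not LLVM API:

// Standalone sketch (hypothetical types, not LLVM's API) of the widening
// patterns: insert_subvector(undef, x, lo/hi) is treated as a 2-way concat,
// with the missing half materialised as an explicit UNDEF subvector.
#include <iostream>
#include <vector>

enum class Opcode { Undef, Value, InsertSubvector };

struct MiniNode {
  Opcode Op;
  unsigned NumElts;                       // vector width in elements
  std::vector<const MiniNode *> Operands; // {Src, Sub} for InsertSubvector
  unsigned InsertIdx = 0;                 // element index of the insertion
};

// Fill Ops with the two half-width subvectors of N, synthesising an UNDEF
// half on the fly for the widening patterns.
static bool collectHalves(const MiniNode &N, const MiniNode &UndefHalf,
                          std::vector<const MiniNode *> &Ops) {
  if (N.Op != Opcode::InsertSubvector)
    return false;
  const MiniNode *Src = N.Operands[0];
  const MiniNode *Sub = N.Operands[1];
  if (N.NumElts != Sub->NumElts * 2 || Src->Op != Opcode::Undef)
    return false;
  if (N.InsertIdx == 0) {             // insert_subvector(undef, x, lo)
    Ops = {Sub, &UndefHalf};          //   -> { x, undef }
    return true;
  }
  if (N.InsertIdx == N.NumElts / 2) { // insert_subvector(undef, x, hi)
    Ops = {&UndefHalf, Sub};          //   -> { undef, x }
    return true;
  }
  return false;
}

int main() {
  MiniNode Undef4{Opcode::Undef, 4, {}, 0};
  MiniNode Undef8{Opcode::Undef, 8, {}, 0};
  MiniNode X{Opcode::Value, 4, {}, 0};
  // v8 = insert_subvector(undef:v8, x:v4, 4): x widened into the high half.
  MiniNode Widened{Opcode::InsertSubvector, 8, {&Undef8, &X}, 4};

  std::vector<const MiniNode *> Ops;
  if (collectHalves(Widened, Undef4, Ops))
    std::cout << "halves: " << Ops.size()
              << ", lo undef: " << (Ops[0]->Op == Opcode::Undef) << "\n";
  // Prints: halves: 2, lo undef: 1
  return 0;
}

This is also why collectConcatOps now takes a SelectionDAG reference in the patch below: the real implementation materialises the missing half with DAG.getUNDEF(SubVT).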
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/oddshuffles.ll
llvm/test/CodeGen/X86/oddsubvector.ll
llvm/test/CodeGen/X86/pr11334.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8cc6a28fadacc..539d2f25a0cf2 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -6322,7 +6322,8 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
 // Helper function to collect subvector ops that are concatenated together,
 // either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
 // The subvectors in Ops are guaranteed to be the same type.
-static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
+static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops,
+                             SelectionDAG &DAG) {
   assert(Ops.empty() && "Expected an empty ops vector");
 
   if (N->getOpcode() == ISD::CONCAT_VECTORS) {
@@ -6338,21 +6339,34 @@
     EVT SubVT = Sub.getValueType();
 
     // TODO - Handle more general insert_subvector chains.
-    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
-        Idx == (VT.getVectorNumElements() / 2)) {
-      // insert_subvector(insert_subvector(undef, x, lo), y, hi)
-      if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
-          Src.getOperand(1).getValueType() == SubVT &&
-          isNullConstant(Src.getOperand(2))) {
-        Ops.push_back(Src.getOperand(1));
+    if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) {
+      // insert_subvector(undef, x, lo)
+      if (Idx == 0 && Src.isUndef()) {
         Ops.push_back(Sub);
+        Ops.push_back(DAG.getUNDEF(SubVT));
         return true;
       }
-      // insert_subvector(x, extract_subvector(x, lo), hi)
-      if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-          Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
-        Ops.append(2, Sub);
-        return true;
+      if (Idx == (VT.getVectorNumElements() / 2)) {
+        // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+        if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+            Src.getOperand(1).getValueType() == SubVT &&
+            isNullConstant(Src.getOperand(2))) {
+          Ops.push_back(Src.getOperand(1));
+          Ops.push_back(Sub);
+          return true;
+        }
+        // insert_subvector(x, extract_subvector(x, lo), hi)
+        if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+            Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+          Ops.append(2, Sub);
+          return true;
+        }
+        // insert_subvector(undef, x, hi)
+        if (Src.isUndef()) {
+          Ops.push_back(DAG.getUNDEF(SubVT));
+          Ops.push_back(Sub);
+          return true;
+        }
       }
     }
   }
@@ -6811,7 +6825,7 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
     }
   }
   SmallVector<SDValue, 2> CatOps;
-  if (collectConcatOps(V.getNode(), CatOps)) {
+  if (collectConcatOps(V.getNode(), CatOps, DAG)) {
     for (SDValue &CatOp : CatOps) {
       SDValue NotCat = IsNOT(CatOp, DAG);
       if (!NotCat) return SDValue();
@@ -25278,7 +25292,8 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
       ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
        !Subtarget.hasBWI())) {
     SmallVector<SDValue, 4> CatOps;
-    if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
+    if (StoredVal.hasOneUse() &&
+        collectConcatOps(StoredVal.getNode(), CatOps, DAG))
       return splitVectorStore(St, DAG);
     return SDValue();
   }
@@ -39744,7 +39759,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
        return SDValue();
      SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1));
      SmallVector<SDValue> SubOps;
-     if (collectConcatOps(Src.getNode(), SubOps) && SubOps.size() == 2)
+     if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2)
        return SubOps[Idx & 1];
      unsigned NumElts = Src.getValueType().getVectorNumElements();
      if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
@@ -43724,8 +43739,8 @@ static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG,
   SDValue FVal = N->getOperand(2);
   SmallVector<SDValue, 4> CatOpsT, CatOpsF;
   if (!TVal.hasOneUse() || !FVal.hasOneUse() ||
-      !collectConcatOps(TVal.getNode(), CatOpsT) ||
-      !collectConcatOps(FVal.getNode(), CatOpsF))
+      !collectConcatOps(TVal.getNode(), CatOpsT, DAG) ||
+      !collectConcatOps(FVal.getNode(), CatOpsF, DAG))
     return SDValue();
 
   auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL,
@@ -45048,7 +45063,7 @@ static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
   // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)).
   if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) {
     SmallVector<SDValue> Ops;
-    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops) &&
+    if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) &&
         Ops.size() == 2) {
       SDLoc DL(EFLAGS);
       EVT SubVT = Ops[0].getValueType().changeTypeToInteger();
@@ -49683,7 +49698,8 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
     // PACK should still be worth it for 128-bit vectors if the sources were
     // originally concatenated from subvectors.
    SmallVector<SDValue> ConcatOps;
-    if (VT.getSizeInBits() > 128 || !collectConcatOps(In.getNode(), ConcatOps))
+    if (VT.getSizeInBits() > 128 ||
+        !collectConcatOps(In.getNode(), ConcatOps, DAG))
       return SDValue();
   }
 
@@ -53607,7 +53623,7 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
 
   // Match concat_vector style patterns.
   SmallVector<SDValue, 2> SubVectorOps;
-  if (collectConcatOps(N, SubVectorOps)) {
+  if (collectConcatOps(N, SubVectorOps, DAG)) {
     if (SDValue Fold =
             combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget))
       return Fold;
@@ -53669,7 +53685,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
   SDValue Sel = peekThroughBitcasts(Ext->getOperand(0));
   SmallVector<SDValue, 4> CatOps;
   if (Sel.getOpcode() != ISD::VSELECT ||
-      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps))
+      !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG))
     return SDValue();
 
   // Note: We assume simple value types because this should only be called with
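The test updates below show the payoff: once a widened value is exposed as { x, undef }, callers can split it, and a later fold can drop the all-undef half entirely - e.g. a 256-bit store of a half-undef vector becoming a single 128-bit store, as in the pr11334.ll diff. A follow-on sketch in the same hypothetical MiniNode model (append it to the sketch above; it illustrates the idea, not LLVM's actual store-splitting code):

// Follow-on sketch reusing MiniNode/collectHalves from above (hypothetical
// model, not LLVM's API). If one collected half is UNDEF, a store of the
// widened value can legitimately be narrowed to the defined half alone,
// since bytes covered by undef elements need not be written.
struct StorePlan {
  const MiniNode *Half; // the half actually written
  unsigned ByteOffset;  // where it lands relative to the wide store
};

static bool planNarrowStore(const MiniNode &Val, const MiniNode &UndefHalf,
                            unsigned HalfBytes, StorePlan &Plan) {
  std::vector<const MiniNode *> Halves;
  if (!collectHalves(Val, UndefHalf, Halves))
    return false;
  if (Halves[1]->Op == Opcode::Undef) {
    Plan = {Halves[0], 0};         // keep only the low half
    return true;
  }
  if (Halves[0]->Op == Opcode::Undef) {
    Plan = {Halves[1], HalfBytes}; // keep only the high half
    return true;
  }
  return false;                    // no narrowing opportunity found
}

With the Widened node from the earlier main(), planNarrowStore selects the high half at offset HalfBytes; in the same spirit, the AVX output in pr11334.ll below drops a vinsertf128 and stores xmm0 instead of ymm0.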
diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll
index 34f6d9ffb6799..be99de22eeb30 100644
--- a/llvm/test/CodeGen/X86/oddshuffles.ll
+++ b/llvm/test/CodeGen/X86/oddshuffles.ll
@@ -2430,14 +2430,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX1-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; AVX1-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovups %ymm0, (%rsi)
+; AVX1-NEXT: vmovdqa %xmm0, 16(%rsi)
+; AVX1-NEXT: vmovdqa %xmm7, 112(%rsi)
+; AVX1-NEXT: vmovdqa %xmm6, 48(%rsi)
 ; AVX1-NEXT: vmovups %ymm1, 128(%rsi)
-; AVX1-NEXT: vmovups %ymm7, 96(%rsi)
-; AVX1-NEXT: vmovups %ymm6, 32(%rsi)
 ; AVX1-NEXT: vmovupd %ymm5, 192(%rsi)
 ; AVX1-NEXT: vmovups %ymm4, 224(%rsi)
 ; AVX1-NEXT: vmovups %ymm3, 160(%rsi)
@@ -2461,11 +2458,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3]
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,1,1]
-; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
-; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,3,3,3]
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[1,1,1,1]
 ; AVX2-NEXT: vpbroadcastd %xmm0, %ymm4
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm0[3,3,3,3,7,7,7,7]
 ; AVX2-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,3,2,3,6,7,6,7]
@@ -2475,9 +2470,9 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; AVX2-NEXT: vmovdqu %ymm6, 192(%rsi)
 ; AVX2-NEXT: vmovdqu %ymm5, 224(%rsi)
 ; AVX2-NEXT: vmovdqu %ymm4, (%rsi)
-; AVX2-NEXT: vmovdqu %ymm3, 64(%rsi)
-; AVX2-NEXT: vmovdqu %ymm2, 32(%rsi)
-; AVX2-NEXT: vmovdqu %ymm1, 96(%rsi)
+; AVX2-NEXT: vmovdqa %xmm3, 48(%rsi)
+; AVX2-NEXT: vmovdqa %xmm2, 112(%rsi)
+; AVX2-NEXT: vmovdqu %ymm1, 64(%rsi)
 ; AVX2-NEXT: vzeroupper
 ; AVX2-NEXT: retq
 ;
@@ -2507,14 +2502,11 @@ define void @D107009(<64 x i32>* %input, <64 x i32>* %output) {
 ; XOP-NEXT: vpermilps {{.*#+}} ymm4 = ymm1[3,3,3,3,7,7,7,7]
 ; XOP-NEXT: vpermilpd {{.*#+}} ymm5 = ymm1[0,0,3,2]
 ; XOP-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,1,1]
-; XOP-NEXT: vinsertf128 $1, %xmm6, %ymm0, %ymm6
 ; XOP-NEXT: vpshufd {{.*#+}} xmm7 = xmm0[3,3,3,3]
-; XOP-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm7
-; XOP-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; XOP-NEXT: vmovups %ymm0, (%rsi)
+; XOP-NEXT: vmovdqa %xmm0, 16(%rsi)
+; XOP-NEXT: vmovdqa %xmm7, 112(%rsi)
+; XOP-NEXT: vmovdqa %xmm6, 48(%rsi)
 ; XOP-NEXT: vmovups %ymm1, 128(%rsi)
-; XOP-NEXT: vmovups %ymm7, 96(%rsi)
-; XOP-NEXT: vmovups %ymm6, 32(%rsi)
 ; XOP-NEXT: vmovupd %ymm5, 192(%rsi)
 ; XOP-NEXT: vmovups %ymm4, 224(%rsi)
 ; XOP-NEXT: vmovups %ymm3, 160(%rsi)
diff --git a/llvm/test/CodeGen/X86/oddsubvector.ll b/llvm/test/CodeGen/X86/oddsubvector.ll
index fb4c23e7de1a4..cda2ab5965509 100644
--- a/llvm/test/CodeGen/X86/oddsubvector.ll
+++ b/llvm/test/CodeGen/X86/oddsubvector.ll
@@ -91,11 +91,11 @@ define void @PR40815(%struct.Mat4* nocapture readonly dereferenceable(64), %stru
 ;
 ; AVX512-LABEL: PR40815:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovaps 16(%rdi), %xmm0
-; AVX512-NEXT: vmovaps 48(%rdi), %xmm1
-; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm1, %ymm1
-; AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
+; AVX512-NEXT: vmovaps 48(%rdi), %xmm0
+; AVX512-NEXT: vmovups 16(%rdi), %ymm1
+; AVX512-NEXT: vinsertf128 $1, (%rdi), %ymm1, %ymm1
+; AVX512-NEXT: vinsertf128 $1, 32(%rdi), %ymm0, %ymm0
+; AVX512-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512-NEXT: vmovups %zmm0, (%rsi)
 ; AVX512-NEXT: vzeroupper
 ; AVX512-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/pr11334.ll b/llvm/test/CodeGen/X86/pr11334.ll
index 666f72ba06cb7..da3e8a6ad1232 100644
--- a/llvm/test/CodeGen/X86/pr11334.ll
+++ b/llvm/test/CodeGen/X86/pr11334.ll
@@ -94,9 +94,7 @@ define void @test_vector_creation() nounwind {
 ; AVX: # %bb.0:
 ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
-; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX-NEXT: vmovaps %ymm0, (%rax)
-; AVX-NEXT: vzeroupper
+; AVX-NEXT: vmovaps %xmm0, (%rax)
 ; AVX-NEXT: retq
   %1 = insertelement <4 x double> undef, double 0.000000e+00, i32 2
   %2 = load double, double addrspace(1)* null