[llvm] 62fc5f1 - [DAGCombiner] Add a most basic `combineShuffleToZeroExtendVectorInReg()`
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Mon Dec 26 11:54:32 PST 2022
Author: Roman Lebedev
Date: 2022-12-26T22:54:03+03:00
New Revision: 62fc5f16405a7d39e62044bc461752f3f31bdca0
URL: https://github.com/llvm/llvm-project/commit/62fc5f16405a7d39e62044bc461752f3f31bdca0
DIFF: https://github.com/llvm/llvm-project/commit/62fc5f16405a7d39e62044bc461752f3f31bdca0.diff
LOG: [DAGCombiner] Add a most basic `combineShuffleToZeroExtendVectorInReg()`
Sometimes we end up with shuffles in the DAG that would be
better represented as a `ISD::ZERO_EXTEND_VECTOR_INREG`,
and a failure to do so causes suboptimal codegen in a number of cases,
especially when we will then cast vector to scalar.
I acknowledge, the test changes here are rather underwhelming,
but as with all of codegen, it's always yak shaving,
and this is the most stripped down version of the patch
that shows *some* effect without having an insurmountable amount
of fallout to deal with. The next change resolves this regression.
The transformation will be extended in follow-ups.
Added:
Modified:
llvm/include/llvm/CodeGen/SelectionDAG.h
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index bdc4b56c589ab..311f55a1a417e 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1940,6 +1940,10 @@ class SelectionDAG {
bool MaskedValueIsAllOnes(SDValue Op, const APInt &Mask,
unsigned Depth = 0) const;
+ /// For each demanded element of a vector, see if it is known to be zero.
+ APInt computeVectorKnownZeroElements(SDValue Op, const APInt &DemandedElts,
+ unsigned Depth = 0) const;
+
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. For vectors, the known bits are those that are shared by
/// every vector element.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 397bc4dda12bf..0b9fa8dd4095f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -22581,10 +22581,11 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
}
-// Match shuffles that can be converted to any_vector_extend_in_reg.
+// Match shuffles that can be converted to *_vector_extend_in_reg.
// This is often generated during legalization.
// e.g. v4i32 <0,u,1,u> -> (v2i64 any_vector_extend_in_reg(v4i32 src)),
// and returns the EVT to which the extension should be performed.
+// NOTE: this assumes that the src is the first operand of the shuffle.
static std::optional<EVT> canCombineShuffleToExtendVectorInreg(
unsigned Opcode, EVT VT, std::function<bool(unsigned)> Match,
SelectionDAG &DAG, const TargetLowering &TLI, bool LegalTypes,
@@ -22600,8 +22601,9 @@ static std::optional<EVT> canCombineShuffleToExtendVectorInreg(
// Attempt to match a '*_extend_vector_inreg' shuffle, we just search for
// power-of-2 extensions as they are the most likely.
+ // FIXME: should try Scale == NumElts case too,
for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
- // Check for non power of 2 vector sizes
+ // The vector width must be a multiple of Scale.
if (NumElts % Scale != 0)
continue;
@@ -22657,6 +22659,108 @@ static SDValue combineShuffleToAnyExtendVectorInreg(ShuffleVectorSDNode *SVN,
return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT, N0));
}
+// Match shuffles that can be converted to zero_extend_vector_inreg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,z,1,u> -> (v2i64 zero_extend_vector_inreg(v4i32 src))
+static SDValue combineShuffleToZeroExtendVectorInReg(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI,
+ bool LegalOperations) {
+ bool LegalTypes = true;
+ EVT VT = SVN->getValueType(0);
+ assert(!VT.isScalableVector() && "Encountered scalable shuffle?");
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // TODO: add support for big-endian when we have a test case.
+ bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+ if (!VT.isInteger() || IsBigEndian)
+ return SDValue();
+
+ SmallVector<int, 16> Mask(SVN->getMask().begin(), SVN->getMask().end());
+ auto ForEachDecomposedIndice = [NumElts, &Mask](auto Fn) {
+ for (int &Indice : Mask) {
+ if (Indice < 0)
+ continue;
+ int OpIdx = (unsigned)Indice < NumElts ? 0 : 1;
+ int OpEltIdx = (unsigned)Indice < NumElts ? Indice : Indice - NumElts;
+ Fn(Indice, OpIdx, OpEltIdx);
+ }
+ };
+
+ // Which elements of which operand does this shuffle demand?
+ std::array<APInt, 2> OpsDemandedElts;
+ for (APInt &OpDemandedElts : OpsDemandedElts)
+ OpDemandedElts = APInt::getZero(NumElts);
+ ForEachDecomposedIndice(
+ [&OpsDemandedElts](int &Indice, int OpIdx, int OpEltIdx) {
+ OpsDemandedElts[OpIdx].setBit(OpEltIdx);
+ });
+
+ // Element-wise(!), which of these demanded elements are know to be zero?
+ std::array<APInt, 2> OpsKnownZeroElts;
+ for (auto I : zip(SVN->ops(), OpsDemandedElts, OpsKnownZeroElts))
+ std::get<2>(I) =
+ DAG.computeVectorKnownZeroElements(std::get<0>(I), std::get<1>(I));
+
+ // Manifest zeroable element knowledge in the shuffle mask.
+ // NOTE: we don't have 'zeroable' sentinel value in generic DAG,
+ // this is a local invention, but it won't leak into DAG.
+ // FIXME: should we not manifest them, but just check when matching?
+ bool HadZeroableElts = false;
+ ForEachDecomposedIndice([&OpsKnownZeroElts, &HadZeroableElts](
+ int &Indice, int OpIdx, int OpEltIdx) {
+ if (OpsKnownZeroElts[OpIdx][OpEltIdx]) {
+ Indice = -2; // Zeroable element.
+ HadZeroableElts = true;
+ }
+ });
+
+ // Don't proceed unless we've refined at least one zeroable mask indice.
+ // If we didn't, then we are still trying to match the same shuffle mask
+ // we previously tried to match as ISD::ANY_EXTEND_VECTOR_INREG,
+ // and evidently failed. Proceeding will lead to endless combine loops.
+ if (!HadZeroableElts)
+ return SDValue();
+
+ // FIXME: the shuffle may be more fine-grained than we want.
+
+ // For example,
+ // shuffle<0,z,1,-1> == (v2i64 zero_extend_vector_inreg(v4i32))
+ // But not shuffle<z,z,1,-1> and not shuffle<0,z,z,-1> ! (for same types)
+ auto isZeroExtend = [NumElts, SrcMask = Mask](unsigned Scale) {
+ assert(Scale >= 2 && Scale <= NumElts && NumElts % Scale == 0 &&
+ "Unexpected mask scaling factor.");
+ ArrayRef<int> Mask = SrcMask;
+ for (unsigned SrcElt = 0, NumSrcElts = NumElts / Scale;
+ SrcElt != NumSrcElts; ++SrcElt) {
+ // Analyze the shuffle mask in Scale-sized chunks.
+ ArrayRef<int> MaskChunk = Mask.take_front(Scale);
+ assert(MaskChunk.size() == Scale && "Unexpected mask size.");
+ Mask = Mask.drop_front(MaskChunk.size());
+ // The first indice in this chunk must be SrcElt, but not zero!
+ // FIXME: undef should be fine, but that results in more-defined result.
+ if (int FirstIndice = MaskChunk[0]; (unsigned)FirstIndice != SrcElt)
+ return false;
+ // The rest of the indices in this chunk must be zeros.
+ // FIXME: undef should be fine, but that results in more-defined result.
+ if (!all_of(MaskChunk.drop_front(1),
+ [](int Indice) { return Indice == -2; }))
+ return false;
+ }
+ assert(Mask.empty() && "Did not process the whole mask?");
+ return true;
+ };
+
+ unsigned Opcode = ISD::ZERO_EXTEND_VECTOR_INREG;
+ SDValue Op = SVN->getOperand(0);
+ // FIXME: try to also match with commutted operands.
+ std::optional<EVT> OutVT = canCombineShuffleToExtendVectorInreg(
+ Opcode, VT, isZeroExtend, DAG, TLI, LegalTypes, LegalOperations);
+ if (!OutVT)
+ return SDValue();
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, SDLoc(SVN), *OutVT, Op));
+}
+
// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
// each source element of a large type into the lowest elements of a smaller
// destination type. This is often generated during legalization.
@@ -23629,6 +23733,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
return V;
+ // Match shuffles that can be converted to ISD::ZERO_EXTEND_VECTOR_INREG.
+ // Perform this really late, because it could eliminate knowledge
+ // of undef elements created by this shuffle.
+ if (Level < AfterLegalizeTypes)
+ if (SDValue V = combineShuffleToZeroExtendVectorInReg(SVN, DAG, TLI,
+ LegalOperations))
+ return V;
+
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 928e61d5ceeaf..e1e95f7004fda 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2557,6 +2557,26 @@ bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
return Mask.isSubsetOf(computeKnownBits(V, Depth).One);
}
+APInt SelectionDAG::computeVectorKnownZeroElements(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isVector() && !VT.isScalableVector() && "Only for fixed vectors!");
+
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(DemandedElts.getBitWidth() == NumElts && "Unexpected demanded mask.");
+
+ APInt KnownZeroElements = APInt::getNullValue(NumElts);
+ for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) {
+ if (!DemandedElts[EltIdx])
+ continue; // Don't query elements that are not demanded.
+ APInt Mask = APInt::getOneBitSet(NumElts, EltIdx);
+ if (MaskedVectorIsZero(Op, Mask, Depth))
+ KnownZeroElements.setBit(EltIdx);
+ }
+ return KnownZeroElements;
+}
+
/// isSplatValue - Return true if the vector V has the same value
/// across all DemandedElts. For scalable vectors, we don't know the
/// number of lanes at compile time. Instead, we use a 1 bit APInt
diff --git a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
index 11f5a7c83fd19..10326997938c2 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -59,8 +59,10 @@ entry:
; Check that this pattern is recognized as a VZIP and
; that the vector blend transform does not scramble the pattern.
+; FIXME: we can not recognize generic ZERO_EXTEND_VECTOR_INREG legalization
+; as a zip1.
; CHECK-LABEL: vzipNoBlend:
-; CHECK: zip1
+; CHECK-NOT: zip1
define <8 x i8> @vzipNoBlend(ptr %A, ptr %B) nounwind {
%t = load <8 x i8>, ptr %A
%vzip = shufflevector <8 x i8> %t, <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
More information about the llvm-commits
mailing list