[llvm] e4d25a9 - [DAG] BUILD_VECTOR: absorb ZERO_EXTEND of a single first operand if all other ops are zeros
Roman Lebedev via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 30 14:08:10 PST 2022
Author: Roman Lebedev
Date: 2022-12-31T00:58:11+03:00
New Revision: e4d25a9c234b1501bc4ac01f4e40d75f6ebee172
URL: https://github.com/llvm/llvm-project/commit/e4d25a9c234b1501bc4ac01f4e40d75f6ebee172
DIFF: https://github.com/llvm/llvm-project/commit/e4d25a9c234b1501bc4ac01f4e40d75f6ebee172.diff
LOG: [DAG] BUILD_VECTOR: absorb ZERO_EXTEND of a single first operand if all other ops are zeros
This kind of pattern seems to come up as regressions
with better ZERO_EXTEND_VECTOR_INREG recognition.
For initial implementation, this is quite restricted
to the minimal viable transform, otherwise there are
too many regressions to be dealt with.
Added:
Modified:
llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
llvm/test/CodeGen/AArch64/build-vector-extract.ll
llvm/test/CodeGen/X86/buildvec-extract.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e6f8b98e9c8e4..ee2918e419404 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -612,6 +612,7 @@ namespace {
SDValue splitMergedValStore(StoreSDNode *ST);
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue convertBuildVecZextToZext(SDNode *N);
+ SDValue convertBuildVecZextToBuildVecWithZeros(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
SDValue reduceBuildVecTruncToBitCast(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
@@ -21451,6 +21452,117 @@ SDValue DAGCombiner::convertBuildVecZextToZext(SDNode *N) {
VT, In);
}
+// If this is a very simple BUILD_VECTOR with first element being a ZERO_EXTEND,
+// and all other elements being constant zero's, granularize the BUILD_VECTOR's
+// element width, absorbing the ZERO_EXTEND, turning it into a constant zero op.
+// This pattern can appear during legalization.
+//
+// NOTE: This can be generalized to allow more than a single
+// non-constant-zero op, UNDEF's, and to be KnownBits-based.
+SDValue DAGCombiner::convertBuildVecZextToBuildVecWithZeros(SDNode *N) {
+ // Don't run this after legalization. Targets may have other preferences.
+ if (Level >= AfterLegalizeDAG)
+ return SDValue();
+
+ // FIXME: support big-endian.
+ if (DAG.getDataLayout().isBigEndian())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+ assert(!VT.isScalableVector() && "Encountered scalable BUILD_VECTOR?");
+
+ unsigned EltBitwidth = VT.getScalarSizeInBits();
+ // NOTE: the actual width of operands may be wider than that!
+
+ // Analyze all operands of this BUILD_VECTOR. What is the largest number of
+ // active bits they all have? We'll want to truncate them all to that width.
+ unsigned ActiveBits = 0;
+ APInt KnownZeroOps(VT.getVectorNumElements(), 0);
+ for (auto I : enumerate(N->ops())) {
+ SDValue Op = I.value();
+ // FIXME: support UNDEF elements?
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ unsigned OpActiveBits =
+ Cst->getAPIntValue().trunc(EltBitwidth).getActiveBits();
+ if (OpActiveBits == 0) {
+ KnownZeroOps.setBit(I.index());
+ continue;
+ }
+ // Profitability check: don't allow non-zero constant operands.
+ return SDValue();
+ }
+ // Profitability check: there must only be a single non-zero operand,
+ // and it must be the first operand of the BUILD_VECTOR.
+ if (I.index() != 0)
+ return SDValue();
+ // The operand must be a zero-extension itself.
+ // FIXME: this could be generalized to known leading zeros check.
+ if (Op.getOpcode() != ISD::ZERO_EXTEND)
+ return SDValue();
+ unsigned CurrActiveBits =
+ Op.getOperand(0).getValueSizeInBits().getFixedSize();
+ assert(!ActiveBits && "Already encountered non-constant-zero operand?");
+ ActiveBits = CurrActiveBits;
+ // We want to at least halve the element size.
+ if (2 * ActiveBits > EltBitwidth)
+ return SDValue();
+ }
+
+ // This BUILD_VECTOR must have at least one non-constant-zero operand.
+ if (ActiveBits == 0)
+ return SDValue();
+
+ // We have EltBitwidth bits, the *minimal* chunk size is ActiveBits,
+ // into how many chunks can we split our element width?
+ unsigned Factor = divideCeil(EltBitwidth, ActiveBits);
+ assert(Factor > 1 && "Did not split the element after all?");
+ assert(EltBitwidth % Factor == 0 && "Can not split into this many chunks?");
+ unsigned ChunkBitwidth = EltBitwidth / Factor;
+ assert(ChunkBitwidth >= ActiveBits && "Underestimated chunk size?");
+ assert(ChunkBitwidth < EltBitwidth && "Failed to reduce element width?");
+
+ EVT OpIntVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+ EVT NewScalarIntVT = EVT::getIntegerVT(*DAG.getContext(), ChunkBitwidth);
+ EVT NewIntVT = EVT::getVectorVT(*DAG.getContext(), NewScalarIntVT,
+ Factor * N->getNumOperands());
+
+ // Never create illegal types.
+ if (!TLI.isTypeLegal(OpIntVT) || !TLI.isTypeLegal(NewScalarIntVT) ||
+ !TLI.isTypeLegal(NewIntVT))
+ return SDValue();
+
+ if (LegalOperations &&
+ !(TLI.isOperationLegalOrCustom(ISD::BITCAST, OpIntVT) &&
+ TLI.isOperationLegalOrCustom(ISD::TRUNCATE, NewScalarIntVT) &&
+ TLI.isOperationLegalOrCustom(ISD::BUILD_VECTOR, NewIntVT)))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue ZeroOp = DAG.getConstant(0, DL, NewScalarIntVT);
+
+ // Recreate the BUILD_VECTOR, with elements now being Factor times smaller.
+ SmallVector<SDValue, 16> NewOps;
+ NewOps.reserve(NewIntVT.getVectorNumElements());
+ for (auto I : enumerate(N->ops())) {
+ SDValue Op = I.value();
+ // FIXME: after allowing UNDEF's, do handle them here.
+ unsigned SrcOpIdx = I.index();
+ if (KnownZeroOps[SrcOpIdx]) {
+ NewOps.append(Factor, ZeroOp);
+ continue;
+ }
+ Op = DAG.getBitcast(OpIntVT, Op);
+ Op = DAG.getNode(ISD::TRUNCATE, DL, NewScalarIntVT, Op);
+ NewOps.emplace_back(Op);
+ NewOps.append(Factor - 1, ZeroOp);
+ }
+ assert(NewOps.size() == NewIntVT.getVectorNumElements());
+ SDValue NewBV = DAG.getBuildVector(NewIntVT, DL, NewOps);
+ NewBV = DAG.getBitcast(VT, NewBV);
+ return NewBV;
+}
+
SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
EVT VT = N->getValueType(0);
@@ -21516,6 +21628,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (SDValue V = convertBuildVecZextToZext(N))
return V;
+ if (SDValue V = convertBuildVecZextToBuildVecWithZeros(N))
+ return V;
+
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
diff --git a/llvm/test/CodeGen/AArch64/build-vector-extract.ll b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
index 7b60a398fa7b6..53e8b568f7096 100644
--- a/llvm/test/CodeGen/AArch64/build-vector-extract.ll
+++ b/llvm/test/CodeGen/AArch64/build-vector-extract.ll
@@ -16,8 +16,7 @@ define <2 x i64> @extract0_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract0_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: fmov w8, s0
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[0]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 0
@@ -42,8 +41,7 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract1_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[1]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[1]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 1
@@ -68,8 +66,7 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract2_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[2]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[2]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 2
@@ -94,8 +91,7 @@ define <2 x i64> @extract3_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; CHECK-LABEL: extract3_i32_zext_insert0_i64_zero:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.2d, #0000000000000000
-; CHECK-NEXT: mov w8, v0.s[3]
-; CHECK-NEXT: mov v1.d[0], x8
+; CHECK-NEXT: mov v1.s[0], v0.s[3]
; CHECK-NEXT: mov v0.16b, v1.16b
; CHECK-NEXT: ret
%e = extractelement <4 x i32> %x, i32 3
diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll
index ebd027b979e73..e63d6249991fe 100644
--- a/llvm/test/CodeGen/X86/buildvec-extract.ll
+++ b/llvm/test/CodeGen/X86/buildvec-extract.ll
@@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0]
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract1_i32_zext_insert0_i64_zero:
@@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) {
define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) {
; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; SSE2-NEXT: pxor %xmm0, %xmm0
-; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE2-NEXT: xorps %xmm1, %xmm1
+; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero:
@@ -375,8 +375,7 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero:
; SSE2: # %bb.0:
-; SSE2-NEXT: pextrw $0, %xmm0, %eax
-; SSE2-NEXT: movd %eax, %xmm0
+; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: retq
;
; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero:
@@ -417,14 +416,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract1_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $1, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract1_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $1, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 1
%z = zext i16 %e to i64
@@ -453,14 +452,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract2_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $2, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract2_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $2, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 2
%z = zext i16 %e to i64
@@ -487,14 +486,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) {
define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) {
; SSE-LABEL: extract3_i16_zext_insert0_i64_zero:
; SSE: # %bb.0:
-; SSE-NEXT: pextrw $3, %xmm0, %eax
-; SSE-NEXT: movd %eax, %xmm0
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: extract3_i16_zext_insert0_i64_zero:
; AVX: # %bb.0:
-; AVX-NEXT: vpextrw $3, %xmm0, %eax
-; AVX-NEXT: vmovd %eax, %xmm0
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX-NEXT: retq
%e = extractelement <8 x i16> %x, i32 3
%z = zext i16 %e to i64
More information about the llvm-commits
mailing list