[llvm] 0fb198e - [X86] Remove combineShuffleOfConcatUndef fold (#144524)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jun 17 07:30:53 PDT 2025
Author: Simon Pilgrim
Date: 2025-06-17T15:30:49+01:00
New Revision: 0fb198e132eff36281a20698588d815c3c30f991
URL: https://github.com/llvm/llvm-project/commit/0fb198e132eff36281a20698588d815c3c30f991
DIFF: https://github.com/llvm/llvm-project/commit/0fb198e132eff36281a20698588d815c3c30f991.diff
LOG: [X86] Remove combineShuffleOfConcatUndef fold (#144524)
We can now let a combination of combineConcatVectorOps and target shuffle combining handle these patterns, instead of creating ISD::CONCAT_VECTORS nodes and hoping they will merge properly.
In the horizontal-sum.ll test changes, we were creating an ISD::CONCAT_VECTORS node that was split again shortly afterwards, but not before its additional uses interfered with HADD folding.
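For readers skimming the log: the removed fold rewrote a shuffle of two sources of the form (concat_vectors tN, undef) into a single-source shuffle of (concat_vectors t1, t2). The standalone C++ sketch below is not part of the patch (remapConcatUndefMask is a made-up name for illustration); it only mirrors the mask arithmetic that rewrite used, where lanes taken from the second source no longer have to skip the half vector of undef lanes.

#include <vector>

// Sketch only, not LLVM code: mirrors the index remapping of the removed fold.
// Mask entries < NumElts select from the first source and keep their index;
// entries >= NumElts selected from the second source, which now sits in the
// upper half of the merged concat, so they shift down by NumElts / 2.
// Sentinel undef entries (-1) are left unchanged.
std::vector<int> remapConcatUndefMask(const std::vector<int> &Mask, int NumElts) {
  std::vector<int> NewMask;
  NewMask.reserve(Mask.size());
  for (int Elt : Mask)
    NewMask.push_back(Elt < NumElts ? Elt : Elt - NumElts / 2);
  return NewMask;
}

For example, with a v8i32 shuffle (NumElts = 8), mask entry 8 (lane 0 of the second source) becomes 4, i.e. the first lane of t2 inside the merged concat.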
Added:

Modified:
    llvm/lib/Target/X86/X86ISelLowering.cpp
    llvm/test/CodeGen/X86/horizontal-sum.ll

Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index cd02d275d6b57..12fcc614ab254 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43301,51 +43301,6 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, const SDLoc &DL,
return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}
-// We are looking for a shuffle where both sources are concatenated with undef
-// and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
-// if we can express this as a single-source shuffle, that's preferable.
-static SDValue combineShuffleOfConcatUndef(SDNode *N, const SDLoc &DL,
-                                           SelectionDAG &DAG,
-                                           const X86Subtarget &Subtarget) {
-  if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
-    return SDValue();
-
-  EVT VT = N->getValueType(0);
-
-  // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
-  if (!VT.is128BitVector() && !VT.is256BitVector())
-    return SDValue();
-
-  if (VT.getVectorElementType() != MVT::i32 &&
-      VT.getVectorElementType() != MVT::i64 &&
-      VT.getVectorElementType() != MVT::f32 &&
-      VT.getVectorElementType() != MVT::f64)
-    return SDValue();
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  // Check that both sources are concats with undef.
-  if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
-      N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
-      N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
-      !N1.getOperand(1).isUndef())
-    return SDValue();
-
-  // Construct the new shuffle mask. Elements from the first source retain their
-  // index, but elements from the second source no longer need to skip an undef.
-  SmallVector<int, 8> Mask;
-  int NumElts = VT.getVectorNumElements();
-
-  auto *SVOp = cast<ShuffleVectorSDNode>(N);
-  for (int Elt : SVOp->getMask())
-    Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
-
-  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
-                               N1.getOperand(0));
-  return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
-}
-
/// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the
/// low half of each source vector and does not set any high half elements in
/// the destination vector, narrow the shuffle to half its original size.
@@ -43401,15 +43356,6 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true))
return LD;
-  // For AVX2, we sometimes want to combine
-  // (vector_shuffle <mask> (concat_vectors t1, undef)
-  //                        (concat_vectors t2, undef))
-  // Into:
-  // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
-  // Since the latter can be efficiently lowered with VPERMD/VPERMQ
-  if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, dl, DAG, Subtarget))
-    return ShufConcat;
-
if (isTargetShuffle(N->getOpcode())) {
SDValue Op(N, 0);
if (SDValue Shuffle = combineTargetShuffle(Op, dl, DAG, DCI, Subtarget))
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 0afc4f784bc5e..568150cfa3971 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -247,13 +247,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -268,13 +268,13 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
-; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
+; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
@@ -424,7 +424,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
@@ -447,7 +447,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
-; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]