[llvm] c753533 - [X86] combineConcatVectorOps - add 256-bit concat(shuffle(),shuffle()) handling
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 02:48:35 PST 2024
Author: Simon Pilgrim
Date: 2024-11-06T10:47:01Z
New Revision: c75353313ed73c6dc04beb322954bb905906f4a1
URL: https://github.com/llvm/llvm-project/commit/c75353313ed73c6dc04beb322954bb905906f4a1
DIFF: https://github.com/llvm/llvm-project/commit/c75353313ed73c6dc04beb322954bb905906f4a1.diff
LOG: [X86] combineConcatVectorOps - add 256-bit concat(shuffle(),shuffle()) handling
Improve IsConcatFree detection to handle splat vector-loads (which can be folded as X86ISD::SUBV_BROADCAST_LOAD).
Fixes #114959
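For illustration only, here is a minimal standalone sketch (plain C++, not the actual LLVM helpers or DAG nodes) of the mask arithmetic the new ISD::VECTOR_SHUFFLE case in the patch performs: the masks of the two subvector shuffles are rebased onto the concatenated operands, with undef (-1) lanes preserved. The mask values in main() are hypothetical, just to show the rebasing.

#include <cstdio>
#include <vector>

// Widen two NumSubElts-wide shuffle masks (operands A0/B0 and A1/B1) into one
// mask over the concatenated operands concat(A0,A1) and concat(B0,B1).
std::vector<int> concatShuffleMasks(const std::vector<int> &Mask0,
                                    const std::vector<int> &Mask1,
                                    int NumSubElts) {
  std::vector<int> NewMask;
  // Lanes from the first shuffle stay in the low half; indices that referred
  // to the second operand must skip over the widened first operand.
  for (int M : Mask0)
    NewMask.push_back(M >= NumSubElts ? M + NumSubElts : M);
  // Lanes from the second shuffle land in the high half; undef (-1) stays put.
  for (int M : Mask1) {
    if (M >= 0)
      M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
    NewMask.push_back(M);
  }
  return NewMask;
}

int main() {
  // Two hypothetical v2f64 shuffle masks (NumSubElts == 2).
  std::vector<int> Lo = {0, 2}; // low half:  A0[0], B0[0]
  std::vector<int> Hi = {1, 3}; // high half: A1[1], B1[1]
  for (int M : concatShuffleMasks(Lo, Hi, /*NumSubElts=*/2))
    std::printf("%d ", M); // prints: 0 4 3 7
  std::printf("\n");
  return 0;
}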
Added:
Modified:
llvm/lib/Target/X86/X86ISelLowering.cpp
llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f0e10fe668687e..d0d082020b3d24 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -56933,6 +56933,11 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
bool AllConstants = true;
bool AllSubs = true;
unsigned VecSize = VT.getSizeInBits();
+ SDValue BC0 = peekThroughBitcasts(SubOps[0].getOperand(Op));
+ if (isa<LoadSDNode>(BC0) && all_of(SubOps, [&](SDValue SubOp) {
+ return BC0 == peekThroughBitcasts(SubOp.getOperand(Op));
+ }))
+ return true;
for (unsigned I = 0, E = SubOps.size(); I != E; ++I) {
SDValue BC = peekThroughBitcasts(SubOps[I].getOperand(Op));
unsigned SubSize = BC.getValueSizeInBits();
@@ -56947,6 +56952,26 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
};
switch (Op0.getOpcode()) {
+ case ISD::VECTOR_SHUFFLE: {
+ if (NumOps == 2 && VT.is256BitVector() &&
+ (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
+ (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1))) {
+ int NumSubElts = Op0.getValueType().getVectorNumElements();
+ SmallVector<int> NewMask;
+ for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
+ M = M >= NumSubElts ? M + NumSubElts : M;
+ NewMask.push_back(M);
+ }
+ for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
+ if (0 <= M)
+ M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
+ NewMask.push_back(M);
+ }
+ return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0),
+ ConcatSubOperand(VT, Ops, 1), NewMask);
+ }
+ break;
+ }
case X86ISD::VBROADCAST: {
if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) {
return Op.getOperand(0).getValueType().is128BitVector();
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 02934cd9db73c3..b93fd4b8f62fbf 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -1810,40 +1810,12 @@ define <4 x double> @broadcast_v4f64_0000_from_v2i64(<2 x i64> %a0) {
; PR114959
define <4 x double> @concat_v4f64_0213_broadcasts(ptr %src) {
-; AVX1OR2-LABEL: concat_v4f64_0213_broadcasts:
-; AVX1OR2: # %bb.0:
-; AVX1OR2-NEXT: vmovups (%rdi), %xmm0
-; AVX1OR2-NEXT: vmovups 32(%rdi), %xmm1
-; AVX1OR2-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX1OR2-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1OR2-NEXT: retq
-;
-; AVX512VL-SLOW-LABEL: concat_v4f64_0213_broadcasts:
-; AVX512VL-SLOW: # %bb.0:
-; AVX512VL-SLOW-NEXT: vmovups (%rdi), %xmm0
-; AVX512VL-SLOW-NEXT: vmovups 32(%rdi), %xmm1
-; AVX512VL-SLOW-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512VL-SLOW-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX512VL-SLOW-NEXT: retq
-;
-; AVX512VL-FAST-ALL-LABEL: concat_v4f64_0213_broadcasts:
-; AVX512VL-FAST-ALL: # %bb.0:
-; AVX512VL-FAST-ALL-NEXT: vmovupd (%rdi), %xmm1
-; AVX512VL-FAST-ALL-NEXT: vmovupd 32(%rdi), %xmm2
-; AVX512VL-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm0 = [0,4,1,5]
-; AVX512VL-FAST-ALL-NEXT: vpermi2pd %ymm2, %ymm1, %ymm0
-; AVX512VL-FAST-ALL-NEXT: retq
-;
-; AVX512VL-FAST-PERLANE-LABEL: concat_v4f64_0213_broadcasts:
-; AVX512VL-FAST-PERLANE: # %bb.0:
-; AVX512VL-FAST-PERLANE-NEXT: vmovups (%rdi), %xmm0
-; AVX512VL-FAST-PERLANE-NEXT: vmovups 32(%rdi), %xmm1
-; AVX512VL-FAST-PERLANE-NEXT: vmovlhps {{.*#+}} xmm2 = xmm0[0],xmm1[0]
-; AVX512VL-FAST-PERLANE-NEXT: vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX512VL-FAST-PERLANE-NEXT: retq
+; ALL-LABEL: concat_v4f64_0213_broadcasts:
+; ALL: # %bb.0:
+; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1]
+; ALL-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[3],ymm0[3]
+; ALL-NEXT: retq
%src.hi = getelementptr inbounds i8, ptr %src, i64 32
%lo = load <2 x double>, ptr %src, align 1
%hi = load <2 x double>, ptr %src.hi, align 1