[llvm] Revert "[SLP] Do not skip tiny trees with gathered loads to vectorize" (PR #190176)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Apr 2 06:26:45 PDT 2026
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-risc-v
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
This reverts commit 94ec7ffa46d351b86fbbe3a445ceef37f331c4a2 to fix
reported issue https://github.com/llvm/llvm-project/pull/190040#issuecomment-4177827078
---
Full diff: https://github.com/llvm/llvm-project/pull/190176.diff
4 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+1-1)
- (modified) llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll (+4-2)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll (+22-6)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll (+22-6)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 68bdc89b0a240..f7c78db5a83ac 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -16896,7 +16896,7 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const {
// No need to vectorize inserts of gathered values.
if (VectorizableTree.size() == 2 &&
isa<InsertElementInst>(VectorizableTree[0]->Scalars[0]) &&
- LoadEntriesToVectorize.empty() && VectorizableTree[1]->isGather() &&
+ VectorizableTree[1]->isGather() &&
(VectorizableTree[1]->getVectorFactor() <= 2 ||
!(isSplat(VectorizableTree[1]->Scalars) ||
allConstant(VectorizableTree[1]->Scalars))))
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
index 90dc555d7bc57..621f5363070a6 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll
@@ -154,7 +154,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4
; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4
-; THR15-NEXT: [[TMP0:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> align 1 zeroinitializer, <2 x i1> splat (i1 true), <2 x i8> poison)
+; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr null, align 1
+; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1
; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1
; THR15-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i32>
; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1
@@ -222,7 +223,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt
; THR15-NEXT: [[TMP66:%.*]] = zext <4 x i8> [[TMP65]] to <4 x i32>
; THR15-NEXT: [[TMP67:%.*]] = sub <4 x i32> [[TMP64]], [[TMP66]]
; THR15-NEXT: [[TMP68:%.*]] = shufflevector <4 x i32> [[TMP67]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; THR15-NEXT: [[TMP70:%.*]] = shufflevector <2 x i8> [[TMP0]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; THR15-NEXT: [[TMP69:%.*]] = insertelement <4 x i8> poison, i8 [[TMP1]], i32 0
+; THR15-NEXT: [[TMP70:%.*]] = insertelement <4 x i8> [[TMP69]], i8 [[TMP0]], i32 1
; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i8> [[TMP62]], <2 x i8> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; THR15-NEXT: [[TMP71:%.*]] = shufflevector <4 x i8> [[TMP70]], <4 x i8> [[TMP116]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; THR15-NEXT: [[TMP72:%.*]] = zext <4 x i8> [[TMP71]] to <4 x i32>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 4c8e88ca34078..8b58d0cdccf2c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -68,24 +68,40 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
;
; AVX512F-LABEL: define void @gather_load(
; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
+; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
-; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
-; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
-; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: define void @gather_load(
; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
-; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: ret void
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
index 82f8886107251..2d6f007cb2341 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll
@@ -68,24 +68,40 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
;
; AVX512F-LABEL: define void @gather_load(
; AVX512F-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
+; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX512F-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
-; AVX512F-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
-; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
-; AVX512F-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+; AVX512F-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX512F-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512F-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512F-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512F-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
; AVX512F-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
; AVX512F-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512F-NEXT: ret void
;
; AVX512VL-LABEL: define void @gather_load(
; AVX512VL-SAME: ptr noalias captures(none) [[TMP0:%.*]], ptr noalias readonly captures(none) [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1
+; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[SHORT_TBAA0:![0-9]+]]
; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 1
+; AVX512VL-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
-; AVX512VL-NEXT: [[TMP6:%.*]] = call <12 x i32> @llvm.masked.load.v12i32.p0(ptr align 4 [[TMP1]], <12 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true>, <12 x i32> poison), !tbaa [[SHORT_TBAA0:![0-9]+]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <12 x i32> [[TMP6]], <12 x i32> poison, <4 x i32> <i32 0, i32 1, i32 4, i32 11>
-; AVX512VL-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+; AVX512VL-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[SHORT_TBAA0]]
+; AVX512VL-NEXT: [[TMP16:%.*]] = insertelement <4 x i32> undef, i32 [[TMP4]], i32 0
+; AVX512VL-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP16]], i32 [[TMP7]], i32 1
+; AVX512VL-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP17]], i32 [[TMP10]], i32 2
+; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP15]], i32 [[TMP12]], i32 3
; AVX512VL-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[TMP13]], <i32 1, i32 2, i32 3, i32 4>
; AVX512VL-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP0]], align 4, !tbaa [[SHORT_TBAA0]]
; AVX512VL-NEXT: ret void
``````````
</details>
https://github.com/llvm/llvm-project/pull/190176
More information about the llvm-commits
mailing list