[llvm] 685bec7 - Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions"
Alexey Bataev via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 15 13:00:42 PDT 2024
Author: Alexey Bataev
Date: 2024-10-15T12:59:44-07:00
New Revision: 685bec722f008ae26593a5ebe3d58ca8e5c4a7c2
URL: https://github.com/llvm/llvm-project/commit/685bec722f008ae26593a5ebe3d58ca8e5c4a7c2
DIFF: https://github.com/llvm/llvm-project/commit/685bec722f008ae26593a5ebe3d58ca8e5c4a7c2.diff
LOG: Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions"
This reverts commit 8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b in order to
investigate and fix the compile-time regressions reported by https://llvm-compile-time-tracker.com/compare.php?from=ec78f0da0e9b1b8e2b2323e434ea742e272dd913&to=8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b&stat=instructions:u
Added:
Modified:
llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 84d77f917bbbdf..336126cc1fbc21 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty,
if (NumParts == 0 || NumParts >= Sz)
return bit_floor(Sz);
unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts));
- if (RegVF > Sz)
- return bit_floor(Sz);
return (Sz / RegVF) * RegVF;
}
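
For reference, here is a minimal standalone sketch (not the LLVM implementation) of the whole-register floor this hunk touches, including the RegVF > Sz guard that the reverted commit had added. Explicit element/register bit widths stand in for the TargetTransformInfo queries of the real getFloorFullVectorNumberOfElements; the helper name and the example targets are illustrative assumptions only.

// Standalone illustration of the whole-register floor computation.
// ElementBits/RegisterBits replace the TTI queries used by the real function.
#include <bit>
#include <cstdio>

static unsigned floorToFullRegisters(unsigned Sz, unsigned ElementBits,
                                     unsigned RegisterBits) {
  unsigned NumParts = (Sz * ElementBits + RegisterBits - 1) / RegisterBits;
  if (NumParts == 0 || NumParts >= Sz)
    return std::bit_floor(Sz);
  // Elements per register, rounded up to a power of two.
  unsigned RegVF = std::bit_ceil((Sz + NumParts - 1) / NumParts);
  if (RegVF > Sz) // Guard added by the reverted commit.
    return std::bit_floor(Sz);
  return (Sz / RegVF) * RegVF; // Largest multiple of RegVF <= Sz.
}

int main() {
  // 48 f32 lanes with 256-bit registers: 6 full registers of 8 -> width 48.
  std::printf("%u\n", floorToFullRegisters(48, 32, 256)); // 48
  // 6 f32 lanes with 256-bit registers: RegVF = 8 > 6, so the guard yields
  // bit_floor(6) = 4; without the guard, (6 / 8) * 8 would floor to 0.
  std::printf("%u\n", floorToFullRegisters(6, 32, 256));  // 4
}
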
@@ -19073,8 +19071,7 @@ class HorizontalReduction {
unsigned ReduxWidth = NumReducedVals;
if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1))
- ReduxWidth = getFloorFullVectorNumberOfElements(
- *TTI, Candidates.front()->getType(), ReduxWidth);
+ ReduxWidth = bit_floor(ReduxWidth);
ReduxWidth = std::min(ReduxWidth, MaxElts);
unsigned Start = 0;
@@ -19082,7 +19079,10 @@ class HorizontalReduction {
// Restarts vectorization attempt with lower vector factor.
unsigned PrevReduxWidth = ReduxWidth;
bool CheckForReusedReductionOpsLocal = false;
- auto AdjustReducedVals = [&](bool IgnoreVL = false) {
+ auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals,
+ &CheckForReusedReductionOpsLocal,
+ &PrevReduxWidth, &V,
+ &IgnoreList](bool IgnoreVL = false) {
bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList);
if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) {
// Check if any of the reduction ops are gathered. If so, worth
@@ -19093,10 +19093,7 @@ class HorizontalReduction {
if (Pos < NumReducedVals - ReduxWidth + 1)
return IsAnyRedOpGathered;
Pos = Start;
- --ReduxWidth;
- if (ReduxWidth > 1)
- ReduxWidth = getFloorFullVectorNumberOfElements(
- *TTI, Candidates.front()->getType(), ReduxWidth);
+ ReduxWidth = bit_ceil(ReduxWidth) / 2;
return IsAnyRedOpGathered;
};
bool AnyVectorized = false;
@@ -19328,10 +19325,7 @@ class HorizontalReduction {
}
Pos += ReduxWidth;
Start = Pos;
- ReduxWidth = NumReducedVals - Pos;
- if (ReduxWidth > 1)
- ReduxWidth = getFloorFullVectorNumberOfElements(
- *TTI, Candidates.front()->getType(), NumReducedVals - Pos);
+ ReduxWidth = llvm::bit_floor(NumReducedVals - Pos);
AnyVectorized = true;
}
if (OptReusedScalars && !AnyVectorized) {
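
Taken together, the hunks above switch the reduction widths back from whole-register multiples to powers of two: the initial width, the shrink step inside AdjustReducedVals, and the width chosen after a successful vectorization all return to bit_floor/bit_ceil arithmetic. The CHECK-line updates in horizontal-list.ll below follow directly from that. The following is a hedged model of the resulting width schedules, assuming f32 lanes, 256-bit registers (8 lanes per register), and a minimum reduction width of 4; these parameters and names are assumptions for illustration, not taken from the patch.

// Hedged model (not the SLP code itself) of the reduction-width schedule for a
// chain of NumReducedVals fadds, assuming 8 f32 lanes per register.
#include <bit>
#include <cstdio>

// Whole-register floor used before the revert; bit_floor(Sz) after it.
static unsigned floorWholeRegisters(unsigned Sz, unsigned RegVF = 8) {
  if (Sz < RegVF)                // cannot fill even one register
    return std::bit_floor(Sz);
  return (Sz / RegVF) * RegVF;   // largest whole-register multiple <= Sz
}

static void printSchedule(unsigned NumReducedVals, bool WholeRegister) {
  unsigned Remaining = NumReducedVals;
  while (Remaining >= 4) {       // assumed minimum reduction width of 4
    unsigned W = WholeRegister ? floorWholeRegisters(Remaining)
                               : std::bit_floor(Remaining);
    std::printf("%u ", W);
    Remaining -= W;
  }
  std::printf("(+%u scalar)\n", Remaining);
}

int main() {
  printSchedule(48, /*WholeRegister=*/true);  // 48 (+0 scalar)     -> old v48f32 check in @f
  printSchedule(48, /*WholeRegister=*/false); // 32 16 (+0 scalar)  -> new v32f32 + v16f32 checks
  printSchedule(30, /*WholeRegister=*/true);  // 24 4 (+2 scalar)   -> old @loadadd31 checks
  printSchedule(30, /*WholeRegister=*/false); // 16 8 4 (+2 scalar) -> new @loadadd31 checks
}
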
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index c9ff2d6426d2b6..72e29839230e81 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -318,14 +318,22 @@ entry:
define float @f(ptr nocapture readonly %x) {
; CHECK-LABEL: @f(
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
-; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
+; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
; CHECK-NEXT: ret float [[OP_RDX]]
;
; THRESHOLD-LABEL: @f(
; THRESHOLD-NEXT: entry:
-; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]])
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4
+; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]])
+; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]])
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]]
; THRESHOLD-NEXT: ret float [[OP_RDX]]
;
entry:
@@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
; CHECK-LABEL: @loadadd31(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
+; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]
@@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) {
; THRESHOLD-LABEL: @loadadd31(
; THRESHOLD-NEXT: entry:
; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1
-; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17
+; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25
; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29
; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4
; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30
; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4
-; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]])
+; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]])
+; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]])
+; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]]
; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]])
; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]]
; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]]