[llvm] [SLP]Improve instruction reordering mode detection. (PR #97485)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 2 14:40:26 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: Alexey Bataev (alexey-bataev)
<details>
<summary>Changes</summary>
The "instruction" reordering mode should be selected only if there are
compatible instructions in other operands, which can be reordered.
Otherwise, better to select splat reordering mode.
Metric: size..text
Program size..text
results results0 diff
test-suite :: External/SPEC/CFP2017rate/526.blender_r/526.blender_r.test 12383340.00 12383324.00 -0.0%
Some 4x operations get replaced by 8x.
---
Full diff: https://github.com/llvm/llvm-project/pull/97485.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+24-2)
- (modified) llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll (+5-5)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll (+4-13)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 868e9e2687f57..1d92cc10bb092 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -2184,6 +2184,27 @@ class BoUpSLP {
return getNumLanes() == 2 || Cnt > 1;
}
+ /// Checks if there is at least single compatible operand in lanes other
+ /// than \p Lane, compatible with the operand \p Op.
+ bool canBeVectorized(Instruction *Op, unsigned OpIdx, unsigned Lane) const {
+ bool OpAPO = getData(OpIdx, Lane).APO;
+ for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ if (any_of(seq<unsigned>(getNumOperands()), [&](unsigned OpI) {
+ const OperandData &Data = getData(OpI, Ln);
+ if (Data.APO != OpAPO || Data.IsUsed)
+ return true;
+ Value *OpILn = getValue(OpI, Ln);
+ return (L && L->isLoopInvariant(OpILn)) ||
+ (getSameOpcode({Op, OpILn}, TLI).getOpcode() &&
+ Op->getParent() == cast<Instruction>(OpILn)->getParent());
+ }))
+ return true;
+ }
+ return false;
+ }
+
public:
/// Initialize with all the operands of the instruction vector \p RootVL.
VLOperands(ArrayRef<Value *> RootVL, const BoUpSLP &R)
@@ -2239,9 +2260,10 @@ class BoUpSLP {
// side.
if (isa<LoadInst>(OpLane0))
ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (isa<Instruction>(OpLane0)) {
+ else if (auto *OpILane0 = dyn_cast<Instruction>(OpLane0)) {
// Check if OpLane0 should be broadcast.
- if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
+ if (shouldBroadcast(OpLane0, OpIdx, FirstLane) ||
+ !canBeVectorized(OpILane0, OpIdx, FirstLane))
ReorderingModes[OpIdx] = ReorderingMode::Splat;
else
ReorderingModes[OpIdx] = ReorderingMode::Opcode;
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
index ff06bdc0e8446..0fcbead65d0d6 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/pr34619.ll
@@ -12,11 +12,11 @@ define void @foo() local_unnamed_addr {
; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 0), align 4
; CHECK-NEXT: [[ARRAYIDX372:%.*]] = getelementptr inbounds [4 x [4 x i32]], ptr @dct_luma, i64 0, i64 3, i64 0
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([4 x [4 x i32]], ptr @bar, i64 0, i64 3, i64 2), align 4
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ADD277]], i32 1
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> undef, [[TMP5]]
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> <i32 poison, i32 undef, i32 poison, i32 poison>, i32 [[TMP0]], i32 0
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> <i32 undef, i32 poison, i32 undef, i32 undef>, i32 [[ADD277]], i32 1
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = ashr <4 x i32> [[TMP6]], <i32 6, i32 6, i32 6, i32 6>
; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX372]], align 4
; CHECK-NEXT: unreachable
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
index 156ab54dbf237..d88135df5c96a 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll
@@ -12,27 +12,18 @@ define void @test() {
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP1]]
-; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV]]
-; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 2
-; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr [[ARRAYIDX11]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 4
; CHECK-NEXT: [[ARRAYIDX31:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX31]], align 4
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i32 0
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP2]], i32 1
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 poison>
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 5, i32 poison>
-; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[TMP6]], i32 3
-; CHECK-NEXT: [[TMP12:%.*]] = insertelement <4 x float> poison, float [[TMP2]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 1, i32 3>
+; CHECK-NEXT: [[TMP14:%.*]] = load <4 x float>, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP14]], <4 x float> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 2>
+; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP4]], float [[TMP0]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = fmul fast <4 x float> [[TMP11]], [[TMP14]]
; CHECK-NEXT: store <4 x float> [[TMP15]], ptr [[ARRAYIDX6]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 5
; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [32000 x float], ptr @a, i64 0, i64 [[INDVARS_IV_NEXT]]
; CHECK-NEXT: [[TMP16]] = load float, ptr [[ARRAYIDX41]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = extractelement <4 x float> [[TMP14]], i32 3
; CHECK-NEXT: [[MUL45:%.*]] = fmul fast float [[TMP16]], [[TMP6]]
; CHECK-NEXT: store float [[MUL45]], ptr [[ARRAYIDX31]], align 4
; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i64 [[INDVARS_IV]], 31990
``````````
</details>
https://github.com/llvm/llvm-project/pull/97485
More information about the llvm-commits
mailing list