[llvm] [SLP] Enable optimization of freeze instructions (PR #102217)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Aug 6 13:35:06 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: John McIver (jmciver)
<details>
<summary>Changes</summary>
Allow SLP optimization to progress in the presence of freeze instructions. Prior
to this commit, freeze instructions blocked SLP optimization.
The following URL shows correctness of the addsub_freeze test:
https://alive2.llvm.org/ce/z/qm38oh
---
Full diff: https://github.com/llvm/llvm-project/pull/102217.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+25-4)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/addsub.ll (+76-20)
- (modified) llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll (+25)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ed9dfa66dc0b5..cea58ab34a882 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6468,6 +6468,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
+ case Instruction::Freeze:
return TreeEntry::Vectorize;
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
@@ -7305,7 +7306,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Freeze: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
@@ -9790,10 +9792,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Freeze: {
auto GetScalarCost = [&](unsigned Idx) {
auto *VI = cast<Instruction>(UniqueValues[Idx]);
- unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
+ unsigned OpIdx = isa<UnaryOperator>(VI) || isa<FreezeInst>(VI) ? 0 : 1;
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
TTI::OperandValueInfo Op2Info =
TTI::getOperandInfo(VI->getOperand(OpIdx));
@@ -9812,7 +9815,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return CommonCost;
}
}
- unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
+ unsigned OpIdx = isa<UnaryOperator>(VL0) || isa<FreezeInst>(VL0) ? 0 : 1;
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
@@ -13298,6 +13301,24 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return V;
}
+ case Instruction::Freeze: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateFreeze(Op);
+ V = FinalShuffle(V, E, VecTy);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 5f8941e9f8893..f7bd2431a7605 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -65,6 +65,62 @@ entry:
ret void
}
+define void @addsub_freeze() #0 {
+; CHECK-LABEL: @addsub_freeze(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @c, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @d, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @e, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = freeze <4 x i32> [[TMP8]]
+; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr @a, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, ptr @b, align 4
+ %1 = load i32, ptr @c, align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32, ptr @d, align 4
+ %3 = load i32, ptr @e, align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ %freeze.add2 = freeze i32 %add2
+ store i32 %freeze.add2, ptr @a, align 4
+ %4 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 1), align 4
+ %5 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 1), align 4
+ %7 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ %freeze.sub = freeze i32 %sub
+ store i32 %freeze.sub, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 1), align 4
+ %8 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 2), align 4
+ %9 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 2), align 4
+ %11 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ %freeze.add7 = freeze i32 %add7
+ store i32 %freeze.add7, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 2), align 4
+ %12 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 3), align 4
+ %13 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 3), align 4
+ %15 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ %freeze.sub10 = freeze i32 %sub10
+ store i32 %freeze.sub10, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 3), align 4
+ ret void
+}
+
; Function Attrs: nounwind uwtable
define void @subadd() #0 {
; CHECK-LABEL: @subadd(
@@ -301,14 +357,14 @@ define void @reorder_alt_subTree() #0 {
define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @reorder_alt_rightsubTree(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[C:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
; CHECK-NEXT: ret void
;
%1 = load double, ptr %a
@@ -332,20 +388,20 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fa, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr @fb, align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: store <4 x float> [[TMP17]], ptr @fc, align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4
; CHECK-NEXT: ret void
;
%1 = load float, ptr @fb, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
index 28e837c2d7a4e..1804ef5e37833 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
@@ -46,6 +46,31 @@ define void @fmuladd_2f64() #0 {
ret void
}
+define void @fmuladd_2f64_freeze() #0 {
+; CHECK-LABEL: @fmuladd_2f64_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @srcA64, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr @srcB64, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr @srcC64, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <2 x double> [[TMP4]]
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr @dst64, align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, ptr @srcA64, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 1), align 8
+ %b0 = load double, ptr @srcB64, align 8
+ %b1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 1), align 8
+ %c0 = load double, ptr @srcC64, align 8
+ %c1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcC64, i32 0, i64 1), align 8
+ %fmuladd0 = call double @llvm.fmuladd.f64(double %a0, double %b0, double %c0)
+ %fmuladd1 = call double @llvm.fmuladd.f64(double %a1, double %b1, double %c1)
+ %freeze0 = freeze double %fmuladd0
+ %freeze1 = freeze double %fmuladd1
+ store double %freeze0, ptr @dst64, align 8
+ store double %freeze1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
define void @fmuladd_4f64() #0 {
; SSE-LABEL: @fmuladd_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @srcA64, align 8
``````````
</details>
https://github.com/llvm/llvm-project/pull/102217
More information about the llvm-commits
mailing list