[llvm] [LICM] Fold associative binary ops to promote code hoisting (PR #81608)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 13 06:35:07 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Ricardo Jesus (rj-jesus)
Changes:
Perform the transformation
"(LV op C1) op C2" ==> "LV op (C1 op C2)"
where op is an associative binary op, LV is loop-variant, and C1 and C2
are loop-invariant, so that (C1 op C2) can be hoisted into the preheader.
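To illustrate with a minimal IR sketch (the value names and surrounding loop are hypothetical, not taken from the patch; only `invariant.op` mirrors the name the patch emits):

```llvm
; Before: both adds execute on every iteration, even though %c1 and %c2
; are loop-invariant.
loop:
  %iv  = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  %tmp = add i64 %iv, %c1          ; LV op C1
  %res = add i64 %tmp, %c2         ; (LV op C1) op C2

; After: (C1 op C2) is computed once in the preheader, and a single add
; remains in the loop body.
ph:
  %invariant.op = add i64 %c1, %c2
loop:
  %iv  = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  %res = add i64 %iv, %invariant.op
```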
Similar patterns could be folded (see the TODO in `hoistBOAssociation`), but this one seems to be the most impactful. The impact of this change is exemplified here: https://godbolt.org/z/1986Tdz4M
I've benchmarked this with RAJAPerf on a Grace system and see an overall performance uplift of 2.5% across the whole benchmark, mainly from ~70% uplifts in three kernels. The worst case I saw was a 2% regression on a single kernel, which I will look into later. If needed, we can make this more restrictive.
---
Full diff: https://github.com/llvm/llvm-project/pull/81608.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LICM.cpp (+77)
- (added) llvm/test/Transforms/LICM/hoist-binop.ll (+66)
- (modified) llvm/test/Transforms/LICM/sink-foldable.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9ec9c31e48e0b6..34de5cdff0771b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2754,6 +2756,75 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
+/// Reassociate general associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!isa<BinaryOperator>(I))
+ return false;
+
+ Instruction::BinaryOps Opcode = cast<BinaryOperator>(&I)->getOpcode();
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+
+ auto ClearSubclassDataAfterReassociation = [](Instruction &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+ };
+
+ if (I.isAssociative()) {
+ // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *LV = Op0->getOperand(0);
+ Value *C1 = Op0->getOperand(1);
+ Value *C2 = I.getOperand(1);
+
+ if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) ||
+ !L.isLoopInvariant(C2))
+ return false;
+
+ bool SingleUseOp0 = Op0->hasOneUse();
+
+ // Conservatively clear all optional flags since they may not be
+ // preserved by the reassociation, but preserve fast-math flags where
+ // applicable.
+ ClearSubclassDataAfterReassociation(I);
+
+ auto *Preheader = L.getLoopPreheader();
+ assert(Preheader && "Loop is not in simplify form?");
+ IRBuilder<> Builder(Preheader->getTerminator());
+ Value *V = Builder.CreateBinOp(Opcode, C1, C2, "invariant.op");
+ I.setOperand(0, LV);
+ I.setOperand(1, V);
+
+ // Note: (LV op C1) might not be erased if it has more than one use.
+ if (SingleUseOp0)
+ eraseInstruction(cast<Instruction>(*Op0), SafetyInfo, MSSAU);
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2791,6 +2862,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
+ if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ ++NumHoisted;
+ ++NumBOAssociationsHoisted;
+ return true;
+ }
+
return false;
}
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 00000000000000..a9281039c9a39c
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Adapted from
+; for(long i = 0; i < n; ++i)
+; a[i] = (i+1) * v;
+define void @test1(i64 %n) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_PH:%.*]]
+; CHECK: for.ph:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALE_2:%.*]] = shl nuw nsw i64 [[VSCALE]], 1
+; CHECK-NEXT: [[VSCALE_4:%.*]] = shl nuw nsw i64 [[VSCALE]], 2
+; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VSCALE_2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[INVARIANT_OP1:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[VEC_INIT]], [[FOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[ADD2:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD1]])
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE_4]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.ph
+
+for.ph:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %vscale.2 = shl nuw nsw i64 %vscale, 1
+ %vscale.4 = shl nuw nsw i64 %vscale, 2
+ %vec.init = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+ %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %vscale.2, i64 0
+ %.splat = shufflevector <vscale x 2 x i64> %.splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %index = phi i64 [ 0, %for.ph ], [ %index.next, %for.body ]
+ %vec.ind = phi <vscale x 2 x i64> [ %vec.init, %for.ph ], [ %vec.ind.next, %for.body ]
+ %step.add = add <vscale x 2 x i64> %vec.ind, %.splat
+ %add1 = add nuw nsw <vscale x 2 x i64> %vec.ind, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add2 = add nuw nsw <vscale x 2 x i64> %step.add, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ call void @use(<vscale x 2 x i64> %add1)
+ call void @use(<vscale x 2 x i64> %add2)
+ %index.next = add nuw i64 %index, %vscale.4
+ %vec.ind.next = add <vscale x 2 x i64> %step.add, %.splat
+ %cmp = icmp eq i64 %index.next, %n
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare void @use(<vscale x 2 x i64>)
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5a12563b..4f853a8f065d9f 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -97,7 +97,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT: [[ADD]] = add i32 [[I_ADDR]], 2
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
``````````
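For reference, here is a self-contained reproducer that can be run through `opt -S -passes=licm` to observe the fold. This is a minimal sketch of the pattern the new code targets, not a test from the patch; the function and value names are hypothetical:

```llvm
define i64 @fold_example(i64 %n, i64 %c1, i64 %c2) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Single-use chain "(LV op C1) op C2": with this patch, LICM should hoist
  ; "%c1 + %c2" into the preheader as %invariant.op, rewrite %res to
  ; "%iv + %invariant.op", and erase %tmp.
  %tmp = add i64 %iv, %c1
  %res = add i64 %tmp, %c2
  call void @consume(i64 %res)
  %iv.next = add i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret i64 %res
}

declare void @consume(i64)
```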
https://github.com/llvm/llvm-project/pull/81608