[llvm] [LICM] Fold associative binary ops to promote code hoisting (PR #81608)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Feb 13 06:35:07 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-llvm-transforms
Author: Ricardo Jesus (rj-jesus)
Changes:
Perform the transformation
"(LV op C1) op C2" ==> "LV op (C1 op C2)"
where op is an associative binary op, LV is loop-variant, and C1 and C2
are loop-invariant, so that (C1 op C2) can be hoisted into the preheader.
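To illustrate with a minimal IR sketch (the value names and surrounding loop are hypothetical, not taken from the patch; only `invariant.op` mirrors the name the patch emits):

```llvm
; Before: both adds execute on every iteration, even though %c1 and %c2
; are loop-invariant.
loop:
  %iv  = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  %tmp = add i64 %iv, %c1          ; LV op C1
  %res = add i64 %tmp, %c2         ; (LV op C1) op C2

; After: (C1 op C2) is computed once in the preheader, and a single add
; remains in the loop body.
ph:
  %invariant.op = add i64 %c1, %c2
loop:
  %iv  = phi i64 [ 0, %ph ], [ %iv.next, %loop ]
  %res = add i64 %iv, %invariant.op
```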
Similar patterns could be folded (see the TODO in `hoistBOAssociation`), but this one seems to be the most impactful. The impact of this change is exemplified here: https://godbolt.org/z/1986Tdz4M
I've benchmarked this with RAJAPerf on a Grace system and see an overall performance uplift of 2.5% across the whole benchmark, mainly from ~70% uplifts in three kernels. The worst case I saw was a 2% regression on a single kernel, which I will look into later. If needed, we can make this more restrictive.
---
Full diff: https://github.com/llvm/llvm-project/pull/81608.diff
3 Files Affected:
- (modified) llvm/lib/Transforms/Scalar/LICM.cpp (+77)
- (added) llvm/test/Transforms/LICM/hoist-binop.ll (+66)
- (modified) llvm/test/Transforms/LICM/sink-foldable.ll (+1-1)
``````````diff
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9ec9c31e48e0b6..34de5cdff0771b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2754,6 +2756,75 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
+/// Reassociate general associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!isa<BinaryOperator>(I))
+ return false;
+
+ Instruction::BinaryOps Opcode = cast<BinaryOperator>(&I)->getOpcode();
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+
+ auto ClearSubclassDataAfterReassociation = [](Instruction &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+ };
+
+ if (I.isAssociative()) {
+ // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *LV = Op0->getOperand(0);
+ Value *C1 = Op0->getOperand(1);
+ Value *C2 = I.getOperand(1);
+
+ if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) ||
+ !L.isLoopInvariant(C2))
+ return false;
+
+ bool SingleUseOp0 = Op0->hasOneUse();
+
+ // Conservatively clear all optional flags since they may not be
+ // preserved by the reassociation, but preserve fast-math flags where
+ // applicable.
+ ClearSubclassDataAfterReassociation(I);
+
+ auto *Preheader = L.getLoopPreheader();
+ assert(Preheader && "Loop is not in simplify form?");
+ IRBuilder<> Builder(Preheader->getTerminator());
+ Value *V = Builder.CreateBinOp(Opcode, C1, C2, "invariant.op");
+ I.setOperand(0, LV);
+ I.setOperand(1, V);
+
+ // Note: (LV op C1) might not be erased if it has more than one use.
+ if (SingleUseOp0)
+ eraseInstruction(cast<Instruction>(*Op0), SafetyInfo, MSSAU);
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2791,6 +2862,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
+ if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ ++NumHoisted;
+ ++NumBOAssociationsHoisted;
+ return true;
+ }
+
return false;
}
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 00000000000000..a9281039c9a39c
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Adapted from
+; for(long i = 0; i < n; ++i)
+; a[i] = (i+1) * v;
+define void @test1(i64 %n) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_PH:%.*]]
+; CHECK: for.ph:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALE_2:%.*]] = shl nuw nsw i64 [[VSCALE]], 1
+; CHECK-NEXT: [[VSCALE_4:%.*]] = shl nuw nsw i64 [[VSCALE]], 2
+; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VSCALE_2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[INVARIANT_OP1:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[VEC_INIT]], [[FOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[ADD2:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD1]])
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE_4]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.ph
+
+for.ph:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %vscale.2 = shl nuw nsw i64 %vscale, 1
+ %vscale.4 = shl nuw nsw i64 %vscale, 2
+ %vec.init = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+ %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %vscale.2, i64 0
+ %.splat = shufflevector <vscale x 2 x i64> %.splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %index = phi i64 [ 0, %for.ph ], [ %index.next, %for.body ]
+ %vec.ind = phi <vscale x 2 x i64> [ %vec.init, %for.ph ], [ %vec.ind.next, %for.body ]
+ %step.add = add <vscale x 2 x i64> %vec.ind, %.splat
+ %add1 = add nuw nsw <vscale x 2 x i64> %vec.ind, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add2 = add nuw nsw <vscale x 2 x i64> %step.add, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ call void @use(<vscale x 2 x i64> %add1)
+ call void @use(<vscale x 2 x i64> %add2)
+ %index.next = add nuw i64 %index, %vscale.4
+ %vec.ind.next = add <vscale x 2 x i64> %step.add, %.splat
+ %cmp = icmp eq i64 %index.next, %n
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare void @use(<vscale x 2 x i64>)
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5a12563b..4f853a8f065d9f 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -97,7 +97,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT: [[ADD]] = add i32 [[I_ADDR]], 2
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
``````````
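For reference, here is a self-contained reproducer that can be run through `opt -S -passes=licm` to observe the fold. This is a minimal sketch of the pattern the new code targets, not a test from the patch; the function and value names are hypothetical:

```llvm
define i64 @fold_example(i64 %n, i64 %c1, i64 %c2) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  ; Single-use chain "(LV op C1) op C2": with this patch, LICM should hoist
  ; "%c1 + %c2" into the preheader as %invariant.op, rewrite %res to
  ; "%iv + %invariant.op", and erase %tmp.
  %tmp = add i64 %iv, %c1
  %res = add i64 %tmp, %c2
  call void @consume(i64 %res)
  %iv.next = add i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, %n
  br i1 %cmp, label %exit, label %loop

exit:
  ret i64 %res
}

declare void @consume(i64)
```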
https://github.com/llvm/llvm-project/pull/81608