[llvm] cd08fad - [LV] Include chains feeding inductions in cost precomputation.

Mon Aug 12 06:46:03 PDT 2024

Author: Florian Hahn
Date: 2024-08-12T14:45:43+01:00
New Revision: cd08fadd03904806fa26a1f117879ddae34fbf67

URL: https://github.com/llvm/llvm-project/commit/cd08fadd03904806fa26a1f117879ddae34fbf67
DIFF: https://github.com/llvm/llvm-project/commit/cd08fadd03904806fa26a1f117879ddae34fbf67.diff

LOG: [LV] Include chains feeding inductions in cost precomputation.

Include the chain of operations feeding each induction in the cost
precomputation for inductions, not just the induction increment. In VPlan,
those instructions will be cleaned up, as both the phi and the increment are
generated independently by VPWidenIntOrFpInductionRecipe.

Fixes https://github.com/llvm/llvm-project/issues/101337.

Added: 
    

Modified: 
    llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 542d74ef0e1ef..41e4b0fcd9b0d 100644

--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7080,7 +7080,16 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
   for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
     Instruction *IVInc = cast<Instruction>(
         IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
-    SmallVector<Instruction *> IVInsts = {IV, IVInc};
+    SmallVector<Instruction *> IVInsts = {IVInc};
+    for (unsigned I = 0; I != IVInsts.size(); I++) {
+      for (Value *Op : IVInsts[I]->operands()) {
+        auto *OpI = dyn_cast<Instruction>(Op);
+        if (Op == IV || !OpI || !OrigLoop->contains(OpI) || !Op->hasOneUse())
+          continue;
+        IVInsts.push_back(OpI);
+      }
+    }
+    IVInsts.push_back(IV);
     for (User *U : IV->users()) {
       auto *CI = cast<Instruction>(U);
       if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))

diff  --git a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
index 6fbe8f61cc76e..7566ac3d3dc51 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/induction-costs.ll
@@ -650,7 +650,77 @@ exit:
   ret void
 }
 
+define void @wombat(i32 %arg, ptr %dst) #1 {
+entry:
+  %mul = mul i32 %arg, 3
+  %zext = zext i32 %arg to i64
+  br label %loop
+
+loop:
+  %phi = phi i64 [ 4, %entry ], [ %add, %loop ]
+  %phi2 = phi i32 [ %mul, %entry ], [ %trunc, %loop ]
+  %getelementptr = getelementptr i32, ptr %dst, i64 %phi
+  %and = and i32 %phi2, 12
+  store i32 %and, ptr %getelementptr, align 4
+  %mul3 = mul i64 %phi, %zext
+  %add = add i64 %phi, 1
+  %icmp = icmp ugt i64 %phi, 65
+  %trunc = trunc i64 %mul3 to i32
+  br i1 %icmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+define void @wombat2(i32 %arg, ptr %dst) #1 {
+entry:
+  %mul = mul i32 %arg, 3
+  %zext = zext i32 %arg to i64
+  br label %loop
+
+loop:
+  %phi = phi i64 [ 4, %entry ], [ %add, %loop ]
+  %phi2 = phi i32 [ %mul, %entry ], [ %trunc.1, %loop ]
+  %getelementptr = getelementptr i32, ptr %dst, i64 %phi
+  %and = and i32 %phi2, 12
+  store i32 %and, ptr %getelementptr, align 4
+  %mul3 = mul i64 %phi, %zext
+  %add = add i64 %phi, 1
+  %icmp = icmp ugt i64 %phi, 65
+  %trunc.0 = trunc i64 %mul3 to i60
+  %trunc.1 = trunc i60 %trunc.0 to i32
+  br i1 %icmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+
+define void @with_dead_use(i32 %arg, ptr %dst) #1 {
+entry:
+  %mul = mul i32 %arg, 3
+  %zext = zext i32 %arg to i64
+  br label %loop
+
+loop:
+  %phi = phi i64 [ 4, %entry ], [ %add, %loop ]
+  %phi2 = phi i32 [ %mul, %entry ], [ %trunc, %loop ]
+  %getelementptr = getelementptr i32, ptr %dst, i64 %phi
+  %and = and i32 %phi2, 12
+  store i32 %and, ptr %getelementptr, align 4
+  %mul3 = mul i64 %phi, %zext
+  %add = add i64 %phi, 1
+  %icmp = icmp ugt i64 %phi, 65
+  %trunc = trunc i64 %mul3 to i32
+  %dead.and = and i32 %trunc, 123
+  br i1 %icmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
 attributes #0 = { "min-legal-vector-width"="0" "target-cpu"="skylake-avx512" }
+attributes #1 = { "target-cpu"="skylake-avx512" "target-features"="-avx512f" }
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}