[PATCH] D109368: [LV] Don't vectorize if we can prove RT + vector cost >= scalar cost.

Tue Sep 7 08:35:47 PDT 2021

fhahn created this revision.
fhahn added reviewers: rengolin, Ayal, gilr, hsaito, lebedev.ri, ebrevnov.
Herald added a subscriber: hiraditya.
fhahn requested review of this revision.
Herald added a project: LLVM.

If we can prove that the cost of the runtime checks plus the total vector
loop cost is greater than or equal to the total scalar cost, vectorization
with runtime checks is not profitable.

This is a first step towards guarding against regressions in cases where
we already know runtime checks are unprofitable, as the heuristics get
tweaked.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D109368

Files:
  llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
  llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll


Index: llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
===================================================================

--- llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
+++ llvm/test/Transforms/LoopVectorize/X86/pointer-runtime-checks-unprofitable.ll
@@ -1,4 +1,6 @@
-; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S %s | FileCheck %s
+; REQUIRES: asserts
+
+; RUN: opt -runtime-memory-check-threshold=9 -passes='loop-vectorize' -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
@@ -10,9 +12,13 @@
 ; than running the scalar loop.
 ; TODO: should not be vectorized.
 define void @test(double* nocapture %A, double* nocapture %B, double* nocapture %C, double* nocapture %D, double* nocapture %E) {
+; CHECK: LV: Vectorization is not beneficial due to runtime check cost
+;
 ; CHECK-LABEL: @test(
-; CHECK: vector.memcheck
-; CHECK: vector.body
+; CHECK-NEXT: entry:
+; CHECK-NEXT:  br label %for.body
+; CHECK-NOT: vector.memcheck
+; CHECK-NOT: vector.body
 ;
 entry:
   br label %for.body
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2007,6 +2007,25 @@
     }
   }
 
+  InstructionCost getCost(LoopVectorizationCostModel &CM) {
+    InstructionCost RTCheckCost = 0;
+    if (SCEVCheckBlock)
+      for (Instruction &I : *SCEVCheckBlock) {
+        if (SCEVCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    if (MemCheckBlock)
+      for (Instruction &I : *MemCheckBlock) {
+        if (MemCheckBlock->getTerminator() == &I)
+          continue;
+        RTCheckCost +=
+            CM.getInstructionCost(&I, ElementCount::getFixed(1)).first;
+      }
+    return RTCheckCost;
+  }
+
   /// Remove the created SCEV & memory runtime check blocks & instructions, if
   /// unused.
   ~GeneratedRTChecks() {
@@ -3304,7 +3323,6 @@
 }
 
 BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
-
   BasicBlock *const SCEVCheckBlock =
       RTChecks.emitSCEVChecks(L, Bypass, LoopVectorPreHeader, LoopExitBlock);
   if (!SCEVCheckBlock)
@@ -8164,7 +8182,29 @@
   if (!SelectedVF.Width.isScalar())
     Checks.Create(OrigLoop, *Legal->getLAI(), PSE.getUnionPredicate());
 
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
   // Check if it is profitable to vectorize with runtime checks.
+  if (!ForceVectorization && SelectedVF.Width.getKnownMinValue() > 1) {
+    if (auto ExpectedTC = getSmallBestKnownTC(*PSE.getSE(), OrigLoop)) {
+      InstructionCost RTCost = Checks.getCost(CM);
+      // The total scalar cost is ScalarCost * ExpectedTC and the total vector
+      // cost is (VectorCost / Width) * ExpectedTC. To avoid dividing by a small
+      // number, we multiply ScalarCost * Width instead. To avoid multiplying
+      // by a potentially large trip count, we divide by ExpectedTC.
+      InstructionCost ScalarCost =
+          SelectedVF.ScalarCost * SelectedVF.Width.getKnownMinValue();
+      if (ScalarCost <= (RTCost / double(*ExpectedTC) + SelectedVF.Cost)) {
+        LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial due to "
+                             "runtime check cost (scalar cost ("
+                          << ScalarCost << ") <= runtime check + vector cost ("
+                          << (RTCost / double(*ExpectedTC) + SelectedVF.Cost)
+                          << ")\n");
+
+        return None;
+      }
+    }
+  }
+
   unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks();
   if (SelectedVF.Width.getKnownMinValue() > 1 && NumRuntimePointerChecks) {
     bool PragmaThresholdReached =


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D109368.371090.patch
Type: text/x-patch
Size: 4119 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20210907/baa85cf4/attachment.bin>