[PATCH] Implement pragma llvm.vectorizer.enable in LoopVectorizer

Thu Nov 28 17:42:20 PST 2013

Hi nadav,

This is a simple implementation of #pragma llvm.vectorizer.enable that forces the vectorizer to transform the loop even when the scalar loop is cheaper than any of the vectorized versions.

The second side effect of this pragma, turning on the vectorizer for that loop only when the vectorizer is not otherwise enabled (e.g. -O0, -O1, -Oz), is not implemented by this patch.

The front-end part (in Clang) is also not implemented; it will come in a separate patch, once the two side effects of this pragma are implemented.

See http://llvm.org/PR18086 for more info.

http://llvm-reviews.chandlerc.com/D2289

Files:
  lib/Transforms/Vectorize/LoopVectorize.cpp
  test/Transforms/LoopVectorize/metadata-enable.ll

Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================

--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -689,9 +689,10 @@
   /// \return The most profitable vectorization factor and the cost of that VF.
   /// This method checks every power of two up to VF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
-  /// possible.
+  /// possible. If Force is true, return the most profitable non-unit factor.
   VectorizationFactor selectVectorizationFactor(bool OptForSize,
-                                                unsigned UserVF);
+                                                unsigned UserVF,
+                                                bool Force);
 
   /// \return The size (in bits) of the widest type in the code that
   /// needs to be vectorized. We ignore values that remain scalar such as
@@ -763,10 +764,13 @@
   unsigned Width;
   /// Vectorization unroll factor.
   unsigned Unroll;
+  /// Force vectorization even when the cost model finds it unprofitable.
+  bool Force;
 
   LoopVectorizeHints(const Loop *L, bool DisableUnrolling)
   : Width(VectorizationFactor)
   , Unroll(DisableUnrolling ? 1 : VectorizationUnroll)
+  , Force(false)
   , LoopID(L->getLoopID()) {
     getHints(L);
     // The command line options override any loop metadata except for when
@@ -877,6 +881,11 @@
         Unroll = Val;
       else
         DEBUG(dbgs() << "LV: ignoring invalid unroll hint metadata\n");
+    } else if (Hint == "enable") {
+      if (C->getBitWidth() == 1)
+        Force = Val;
+      else
+        DEBUG(dbgs() << "LV: ignoring invalid enable hint metadata\n");
     } else {
       DEBUG(dbgs() << "LV: ignoring unknown hint " << Hint << '\n');
     }
@@ -960,7 +969,7 @@
 
     // Select the optimal vectorization factor.
     LoopVectorizationCostModel::VectorizationFactor VF;
-    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width);
+    VF = CM.selectVectorizationFactor(OptForSize, Hints.Width, Hints.Force);
     // Select the unroll factor.
     unsigned UF = CM.selectUnrollFactor(OptForSize, Hints.Unroll, VF.Width,
                                         VF.Cost);
@@ -4387,7 +4396,8 @@
 
 LoopVectorizationCostModel::VectorizationFactor
 LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize,
-                                                      unsigned UserVF) {
+                                                      unsigned UserVF,
+                                                      bool Force) {
   // Width 1 means no vectorize
   VectorizationFactor Factor = { 1U, 0U };
   if (OptForSize && Legal->getRuntimePointerCheck()->Need) {
@@ -4451,22 +4461,39 @@
     return Factor;
   }
 
-  float Cost = expectedCost(1);
+  float ScalarCost = expectedCost(1);
   unsigned Width = 1;
-  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)Cost << ".\n");
+  DEBUG(dbgs() << "LV: Scalar loop costs: " << (int)ScalarCost << ".\n");
+  // Compute only the vectorized loop costs here; the comparison against the
+  // scalar cost happens after the loop. If the pragma forced vectorization,
+  // we skip that comparison and return the cheapest VF != 1.
+  float Cost = 0.0;
   for (unsigned i=2; i <= VF; i*=2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
     float VectorCost = expectedCost(i) / (float)i;
     DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " <<
           (int)VectorCost << ".\n");
-    if (VectorCost < Cost) {
+    if (Cost == 0.0) {
+      Cost = VectorCost;
+      Width = i;
+    } else if (VectorCost < Cost) {
       Cost = VectorCost;
       Width = i;
     }
   }
 
+  if (!Force) {
+    // Compare again with scalar, since we're not forcing
+    if (ScalarCost < Cost) {
+      Cost = ScalarCost;
+      Width = 1;
+    }
+  } else {
+    DEBUG(dbgs() << "LV: #pragma vecorize enable, ignore scalar costs\n");
+  }
+
   DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
   Factor.Width = Width;
   Factor.Cost = Width * Cost;
Index: test/Transforms/LoopVectorize/metadata-enable.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/metadata-enable.ll
@@ -0,0 +1,40 @@
+; RUN: opt < %s -loop-vectorize -dce -instcombine -S | FileCheck %s
+
+; This file tests that llvm.vectorizer.enable metadata forces an unprofitable
+; loop to vectorize. Here are some of the costs the vectorizer found:
+; LV: Scalar loop costs: 6.
+; LV: Vector loop of width 2 costs: 42.
+; LV: Vector loop of width 4 costs: 41.
+; LV: Vector loop of width 8 costs: 6.
+; LV: Selecting VF = : 8.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @test1(
+; CHECK: store <8 x i32>
+; CHECK: ret i32
+
+define i32 @test1(i32* noalias nocapture %a, i32* noalias nocapture readonly %b, i32 %N) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %div = sdiv i32 255, %0
+  %div1 = sdiv i32 %div, %N
+  %arrayidx3 = getelementptr inbounds i32* %a, i64 %indvars.iv
+  store i32 %div1, i32* %arrayidx3, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 32
+  br i1 %exitcond, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:                                          ; preds = %for.body
+  %1 = load i32* %a, align 4, !tbaa !1
+  ret i32 %1
+}
+
+!0 = metadata !{metadata !0, metadata !1}
+!1 = metadata !{metadata !"llvm.vectorizer.enable", i1 1}
\ No newline at end of file
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D2289.1.patch
Type: text/x-patch
Size: 6047 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20131128/f624890d/attachment.bin>