[llvm] r200530 - [vectorizer] Tweak the way we do small loop runtime unrolling in the

Chandler Carruth chandlerc at gmail.com
Fri Jan 31 02:51:08 PST 2014


Author: chandlerc
Date: Fri Jan 31 04:51:08 2014
New Revision: 200530

URL: http://llvm.org/viewvc/llvm-project?rev=200530&view=rev
Log:
[vectorizer] Tweak the way we do small loop runtime unrolling in the
loop vectorizer to not do so when runtime pointer checks are needed and
share code with the new (not yet enabled) load/store saturation runtime
unrolling. Also ensure that we only consider the runtime checks when the
loop hasn't already been vectorized. If it has, the runtime check cost
has already been paid.

I've fleshed out a test case to cover the scalar unrolling as well as
the vector unrolling and comment clearly why we are or aren't following
the pattern.

Modified:
    llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
    llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll

Modified: llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp?rev=200530&r1=200529&r2=200530&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp (original)
+++ llvm/trunk/lib/Transforms/Vectorize/LoopVectorize.cpp Fri Jan 31 04:51:08 2014
@@ -5195,26 +5195,33 @@ LoopVectorizationCostModel::selectUnroll
     return UF;
   }
 
-  if (EnableLoadStoreRuntimeUnroll &&
-      !Legal->getRuntimePointerCheck()->Need &&
+  // Note that if we've already vectorized the loop we will have done the
+  // runtime check and so unrolling won't require further checks.
+  bool UnrollingRequiresRuntimePointerCheck =
+      (VF == 1 && Legal->getRuntimePointerCheck()->Need);
+
+  // We want to unroll small loops in order to reduce the loop overhead and
+  // potentially expose ILP opportunities.
+  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
+  if (!UnrollingRequiresRuntimePointerCheck &&
       LoopCost < SmallLoopCost) {
+    // We assume that the cost overhead is 1 and we use the cost model
+    // to estimate the cost of the loop and unroll until the cost of the
+    // loop overhead is about 5% of the cost of the loop.
+    unsigned SmallUF = std::min(UF, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
     // Unroll until store/load ports (estimated by max unroll factor) are
     // saturated.
-    unsigned UnrollStores = UF / (Legal->NumStores ? Legal->NumStores : 1);
-    unsigned UnrollLoads = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
-    UF = std::max(std::min(UnrollStores, UnrollLoads), 1u);
-    return UF;
-  }
+    unsigned StoresUF = UF / (Legal->NumStores ? Legal->NumStores : 1);
+    unsigned LoadsUF = UF /  (Legal->NumLoads ? Legal->NumLoads : 1);
+
+    if (EnableLoadStoreRuntimeUnroll && std::max(StoresUF, LoadsUF) > SmallUF) {
+      DEBUG(dbgs() << "LV: Unrolling to saturate store or load ports.\n");
+      return std::max(StoresUF, LoadsUF);
+    }
 
-  // We want to unroll tiny loops in order to reduce the loop overhead.
-  // We assume that the cost overhead is 1 and we use the cost model
-  // to estimate the cost of the loop and unroll until the cost of the
-  // loop overhead is about 5% of the cost of the loop.
-  DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n');
-  if (LoopCost < SmallLoopCost) {
     DEBUG(dbgs() << "LV: Unrolling to reduce branch cost.\n");
-    unsigned NewUF = PowerOf2Floor(SmallLoopCost / LoopCost);
-    return std::min(NewUF, UF);
+    return SmallUF;
   }
 
   DEBUG(dbgs() << "LV: Not Unrolling.\n");

Modified: llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll?rev=200530&r1=200529&r2=200530&view=diff
==============================================================================
--- llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll (original)
+++ llvm/trunk/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll Fri Jan 31 04:51:08 2014
@@ -1,13 +1,26 @@
-; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S \
+; RUN:   | FileCheck %s --check-prefix=CHECK-VECTOR
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-unroll=0 -dce -S \
+; RUN:   | FileCheck %s --check-prefix=CHECK-SCALAR
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
-;CHECK-LABEL: @foo(
-;CHECK: load <4 x i32>
-;CHECK-NOT: load <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK-NOT: store <4 x i32>
-;CHECK: ret
+
+; We don't unroll this loop because it has a small constant trip count.
+;
+; CHECK-VECTOR-LABEL: @foo(
+; CHECK-VECTOR: load <4 x i32>
+; CHECK-VECTOR-NOT: load <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR-NOT: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @foo(
+; CHECK-SCALAR: load i32*
+; CHECK-SCALAR-NOT: load i32*
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR-NOT: store i32
+; CHECK-SCALAR: ret
 define i32 @foo(i32* nocapture %A) nounwind uwtable ssp {
   br label %1
 
@@ -26,10 +39,18 @@ define i32 @foo(i32* nocapture %A) nounw
   ret i32 undef
 }
 
-;CHECK-LABEL: @bar(
-;CHECK: store <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret
+; But this is a good small loop to unroll as we don't know of a bound on its
+; trip count.
+;
+; CHECK-VECTOR-LABEL: @bar(
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: store <4 x i32>
+; CHECK-VECTOR: ret
+;
+; CHECK-SCALAR-LABEL: @bar(
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR: store i32
+; CHECK-SCALAR: ret
 define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp {
   %1 = icmp sgt i32 %n, 0
   br i1 %1, label %.lr.ph, label %._crit_edge
@@ -49,10 +70,16 @@ define i32 @bar(i32* nocapture %A, i32 %
   ret i32 undef
 }
 
-; Also unroll if we need a runtime check.
-; CHECK-LABEL: runtime_chk
-; CHECK: store <4 x float>
-; CHECK: store <4 x float>
+; Also unroll if we need a runtime check but it was going to be added for
+; vectorization anyways.
+; CHECK-VECTOR-LABEL: @runtime_chk(
+; CHECK-VECTOR: store <4 x float>
+; CHECK-VECTOR: store <4 x float>
+;
+; But not if the unrolling would introduce the runtime check.
+; CHECK-SCALAR-LABEL: @runtime_chk(
+; CHECK-SCALAR: store float
+; CHECK-SCALAR-NOT: store float
 define void @runtime_chk(float* %A, float* %B, float %N) {
 entry:
   br label %for.body





More information about the llvm-commits mailing list