[llvm] r310352 - [X86] Improved X86::CMOV to Branch heuristic.

Amjad Aboud via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 8 05:17:56 PDT 2017


Author: aaboud
Date: Tue Aug  8 05:17:56 2017
New Revision: 310352

URL: http://llvm.org/viewvc/llvm-project?rev=310352&view=rev
Log:
[X86] Improved X86::CMOV to Branch heuristic.
Resolved PR33954.
This patch adds two more constraints that aim to reduce the noisy cases where we convert a CMOV into a branch for only a small gain, and end up spending more cycles due to the branching overhead.

Differential Revision: https://reviews.llvm.org/D36081

Added:
    llvm/trunk/test/CodeGen/X86/pr33954.ll
Modified:
    llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
    llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll

Modified: llvm/trunk/lib/Target/X86/X86CmovConversion.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/X86/X86CmovConversion.cpp?rev=310352&r1=310351&r2=310352&view=diff
==============================================================================
--- llvm/trunk/lib/Target/X86/X86CmovConversion.cpp (original)
+++ llvm/trunk/lib/Target/X86/X86CmovConversion.cpp Tue Aug  8 05:17:56 2017
@@ -67,6 +67,11 @@ static cl::opt<bool>
                         cl::desc("Enable the X86 cmov-to-branch optimization."),
                         cl::init(true), cl::Hidden);
 
+static cl::opt<unsigned>
+    GainCycleThreshold("x86-cmov-converter-threshold",
+                       cl::desc("Minimum gain per loop (in cycles) threshold."),
+                       cl::init(4), cl::Hidden);
+
 /// Converts X86 cmov instructions into branches when profitable.
 class X86CmovConverterPass : public MachineFunctionPass {
 public:
@@ -389,19 +394,28 @@ bool X86CmovConverterPass::checkForProfi
   //           Critical-path is iteration dependent - there is dependency of
   //           critical-path instructions on critical-path instructions of
   //           previous iteration.
-  //           Thus, it is required to check the gradient of the gain - the
-  //           change in Depth-Diff compared to the change in Loop-Depth between
-  //           1st and 2nd iterations.
+  //           Thus, check the gain percent of the 2nd iteration (similar to the
+  //           previous case), but it is also required to check the gradient of
+  //           the gain - the change in Depth-Diff compared to the change in
+  //           Loop-Depth between 1st and 2nd iterations.
   //           To be conservative, the gradient need to be at least 50%.
   //
+  //   In addition, In order not to optimize loops with very small gain, the
+  //   gain (in cycles) after 2nd iteration should not be less than a given
+  //   threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
+  //
   // If loop is not worth optimizing, remove all CMOV-group-candidates.
   //===--------------------------------------------------------------------===//
+  if (Diff[1] < GainCycleThreshold)
+    return false;
+
   bool WorthOptLoop = false;
   if (Diff[1] == Diff[0])
     WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
   else if (Diff[1] > Diff[0])
     WorthOptLoop =
-        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+        (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
+        (Diff[1] * 8 >= LoopDepth[1].Depth);
 
   if (!WorthOptLoop)
     return false;

Added: llvm/trunk/test/CodeGen/X86/pr33954.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/pr33954.ll?rev=310352&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/pr33954.ll (added)
+++ llvm/trunk/test/CodeGen/X86/pr33954.ll Tue Aug  8 05:17:56 2017
@@ -0,0 +1,91 @@
+; RUN: llc -mtriple=x86_64-pc-linux -x86-cmov-converter=true -verify-machineinstrs < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; This test checks that x86-cmov-converter optimization does not transform CMOV
+;; instruction when the gain (in cycles) of converting to branch is less than
+;; a fix threshold (measured for "-x86-cmov-converter-threshold=4").
+;;
+;; Test was created using the following command line:
+;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
+;; Where foo.c is:
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;int bar(int *a, int *b, int n) {
+;;  int sum = 0;
+;;  for (int i = 0; i < n; ++i) {
+;;    int x = a[i] * a[i+1] * a[i+2];
+;;    int y = b[i] * b[i+1];
+;;    sum += y > x ? x : 0;
+;;  }
+;;  return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Adding a test to the above function shows code with CMOV is 25% faster than
+;; the code with branch.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;#define N 10000
+;;int A[N];
+;;int B[N];
+;;
+;;
+;;
+;;int main () {
+;;  for (int i=0; i< N; ++i) {
+;;    A[i] = i%4;
+;;    B[i] = i%5;
+;;  }
+;;  int sum = 0;
+;;  for (int i=0; i< N*10; ++i)
+;;    sum += bar(A, B, N);
+;;  return sum;
+;;}
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; CHECK-NOT: jg
+; CHECK: cmovle
+define i32 @bar(i32* nocapture readonly %a, i32* nocapture readonly %b, i32 %n) #0 {
+entry:
+  %cmp30 = icmp sgt i32 %n, 0
+  br i1 %cmp30, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %.pre = load i32, i32* %a, align 4
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 1
+  %.pre34 = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  %.pre35 = load i32, i32* %b, align 4
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  %sum.0.lcssa = phi i32 [ 0, %entry ], [ %add14, %for.body ]
+  ret i32 %sum.0.lcssa
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %0 = phi i32 [ %.pre35, %for.body.preheader ], [ %5, %for.body ]
+  %1 = phi i32 [ %.pre34, %for.body.preheader ], [ %4, %for.body ]
+  %2 = phi i32 [ %.pre, %for.body.preheader ], [ %1, %for.body ]
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %sum.032 = phi i32 [ 0, %for.body.preheader ], [ %add14, %for.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %mul = mul nsw i32 %1, %2
+  %3 = add nuw nsw i64 %indvars.iv, 2
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %3
+  %4 = load i32, i32* %arrayidx5, align 4
+  %mul6 = mul nsw i32 %mul, %4
+  %arrayidx11 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.next
+  %5 = load i32, i32* %arrayidx11, align 4
+  %mul12 = mul nsw i32 %5, %0
+  %cmp13 = icmp sgt i32 %mul12, %mul6
+  %cond = select i1 %cmp13, i32 %mul6, i32 0
+  %add14 = add nsw i32 %cond, %sum.032
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = {"target-cpu"="skylake"}
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 2}
+!1 = !{i32 7, !"PIC Level", i32 2}
+!2 = !{!"clang version 5.0.0 (trunk)"}

Modified: llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll?rev=310352&r1=310351&r2=310352&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll (original)
+++ llvm/trunk/test/CodeGen/X86/x86-cmov-converter.ll Tue Aug  8 05:17:56 2017
@@ -3,13 +3,13 @@
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; This test checks that x86-cmov-converter optimization transform CMOV
 ;; instruction into branches when it is profitable.
-;; There are 5 cases below:
+;; There are 6 cases below:
 ;;   1. CmovInCriticalPath:
 ;;        CMOV depends on the condition and it is in the hot path.
 ;;        Thus, it worths transforming.
 ;;
 ;;   2. CmovNotInCriticalPath:
-;;        similar test like in (1), just that CMOV is not in the hot path.
+;;        Similar test like in (1), just that CMOV is not in the hot path.
 ;;        Thus, it does not worth transforming.
 ;;
 ;;   3. MaxIndex:
@@ -26,16 +26,21 @@
 ;;        Usually, binary search CMOV is not predicted.
 ;;        Thus, it does not worth transforming.
 ;;
+;;   6. SmallGainPerLoop:
+;;        The gain percentage from converting CMOV into branch is acceptable,
+;;        however, the absolute gain is smaller than a threshold.
+;;        Thus, it does not worth transforming.
+;;
 ;; Test was created using the following command line:
 ;; > clang -S -O2 -m64 -fno-vectorize -fno-unroll-loops -emit-llvm foo.c -o -
 ;; Where foo.c is:
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;;void CmovInHotPath(int n, int a, int b, int *c, int *d) {
 ;;  for (int i = 0; i < n; i++) {
-;;    int t = c[i];
+;;    int t = c[i] + 1;
 ;;    if (c[i] * a > b)
 ;;      t = 10;
-;;    c[i] = t;
+;;    c[i] = (c[i] + 1) * t;
 ;;  }
 ;;}
 ;;
@@ -87,6 +92,16 @@
 ;;  }
 ;;  return Curr->Val;
 ;;}
+;;
+;;
+;;void SmallGainPerLoop(int n, int a, int b, int *c, int *d) {
+;;  for (int i = 0; i < n; i++) {
+;;    int t = c[i];
+;;    if (c[i] * a > b)
+;;      t = 10;
+;;    c[i] = t;
+;;  }
+;;}
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 
 %struct.Node = type { i32, %struct.Node*, %struct.Node* }
@@ -111,10 +126,12 @@ for.body:
   %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
   %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
   %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 1
   %mul = mul nsw i32 %0, %a
   %cmp3 = icmp sgt i32 %mul, %b
-  %. = select i1 %cmp3, i32 10, i32 %0
-  store i32 %., i32* %arrayidx, align 4
+  %. = select i1 %cmp3, i32 10, i32 %add
+  %mul7 = mul nsw i32 %., %add
+  store i32 %mul7, i32* %arrayidx, align 4
   %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
   %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
   br i1 %exitcond, label %for.cond.cleanup, label %for.body
@@ -253,6 +270,35 @@ while.end:
   ret i32 %.lcssa
 }
 
+; CHECK-LABEL: SmallGainPerLoop
+; CHECK-NOT: jg
+; CHECK: cmovg
+
+define void @SmallGainPerLoop(i32 %n, i32 %a, i32 %b, i32* nocapture %c, i32* nocapture readnone %d) #0 {
+entry:
+  %cmp14 = icmp sgt i32 %n, 0
+  br i1 %cmp14, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %mul = mul nsw i32 %0, %a
+  %cmp3 = icmp sgt i32 %mul, %b
+  %. = select i1 %cmp3, i32 10, i32 %0
+  store i32 %., i32* %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
 ;; The following test checks that x86-cmov-converter optimization transforms
 ;; CMOV instructions into branch correctly.




More information about the llvm-commits mailing list