[llvm] b92c8c2 - [NewPM] Disable non-trivial loop-unswitch on targets with divergence

Thu Mar 25 04:28:40 PDT 2021

Author: Sameer Sahasrabuddhe
Date: 2021-03-25T11:27:10Z
New Revision: b92c8c22b924969fe6cbe1b9faf874333d4eafd0

URL: https://github.com/llvm/llvm-project/commit/b92c8c22b924969fe6cbe1b9faf874333d4eafd0
DIFF: https://github.com/llvm/llvm-project/commit/b92c8c22b924969fe6cbe1b9faf874333d4eafd0.diff

LOG: [NewPM] Disable non-trivial loop-unswitch on targets with divergence

Unswitching a loop on a non-trivial divergent branch is expensive
since it serializes the execution of both version of the
loop. But identifying a divergent branch needs divergence analysis,
which is a function level analysis.

The legacy pass manager handles this dependency by isolating such a
loop transform and rerunning the required function analyses. This
functionality is currently missing in the new pass manager, and there
is no safe way for the SimpleLoopUnswitch pass to depend on
DivergenceAnalysis. So we conservatively assume that all non-trivial
branches are divergent if the target has divergence.

Reviewed By: tra

Differential Revision: https://reviews.llvm.org/D98958

Added: 
    llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll

Modified: 
    llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
    llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 92461ea88c63..cf77cf70e323 100644

--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -2901,10 +2901,20 @@ static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
     return true;
   }
 
-  // If we're not doing non-trivial unswitching, we're done. We both accept
-  // a parameter but also check a local flag that can be used for testing
-  // a debugging.
-  if (!NonTrivial && !EnableNonTrivialUnswitch)
+  // Check whether we should continue with non-trivial conditions.
+  // EnableNonTrivialUnswitch: Global variable that forces non-trivial
+  //                           unswitching for testing and debugging.
+  // NonTrivial: Parameter that enables non-trivial unswitching for this
+  //             invocation of the transform. But this should be allowed only
+  //             for targets without branch divergence.
+  //
+  // FIXME: If divergence analysis becomes available to a loop
+  // transform, we should allow unswitching for non-trivial uniform
+  // branches even on targets that have divergence.
+  // https://bugs.llvm.org/show_bug.cgi?id=48819
+  bool ContinueWithNonTrivial =
+      EnableNonTrivialUnswitch || (NonTrivial && !TTI.hasBranchDivergence());
+  if (!ContinueWithNonTrivial)
     return false;
 
   // Skip non-trivial unswitching for optsize functions.

diff  --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
index 873a7653973d..40146648a39a 100644
--- a/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -1,47 +1,4 @@
-; RUN: opt -mtriple=amdgcn-- -O3 -S -enable-new-pm=0 %s | FileCheck %s
-
-; This fails with the new pass manager:
-; https://bugs.llvm.org/show_bug.cgi?id=48819
-
-; Check that loop unswitch happened and condition hoisted out of the loop.
-; Condition is uniform so all targets should perform unswitching.
-
-; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
-; CHECK: entry:
-; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
-; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
-; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
-; CHECK-NEXT: br i1
-
-define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
-entry:
-  %cmp6 = icmp sgt i32 %n, 0
-  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
-
-for.body.lr.ph:                                   ; preds = %entry
-  %cmp1 = icmp eq i32 %x, 123456
-  br label %for.body
-
-for.cond.cleanup.loopexit:                        ; preds = %for.inc
-  br label %for.cond.cleanup
-
-for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
-  ret void
-
-for.body:                                         ; preds = %for.inc, %for.body.lr.ph
-  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
-  br i1 %cmp1, label %if.then, label %for.inc
-
-if.then:                                          ; preds = %for.body
-  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
-  store i32 %i.07, i32 * %arrayidx, align 4
-  br label %for.inc
-
-for.inc:                                          ; preds = %for.body, %if.then
-  %inc = add nuw nsw i32 %i.07, 1
-  %exitcond = icmp eq i32 %inc, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
-}
+; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
 
 ; Check that loop unswitch does not happen if condition is divergent.
 

diff  --git a/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll
new file mode 100644
index 000000000000..943a5338bacb
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnswitch/AMDGPU/uniform-unswitch.ll
@@ -0,0 +1,53 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
+; XFAIL: *
+
+; Check that loop unswitch happened and condition hoisted out of the loop.
+; Condition is uniform so even targets with divergence should perform unswitching.
+
+; This fails with the new pass manager:
+; https://bugs.llvm.org/show_bug.cgi?id=48819
+; The correct behaviour (allow uniform non-trivial branches to be
+; unswitched on all targets) requires access to the function-level
+; divergence analysis from a loop transform, which is currently not
+; supported in the new pass manager.
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
+; CHECK: entry:
+; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
+; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
+; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
+; CHECK-NEXT: br i1
+
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp1 = icmp eq i32 %x, 123456
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
+  store i32 %i.07, i32 * %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }