[llvm] r329165 - StructurizeCFG: Test for branch divergence correctly

Wed Apr 4 03:58:15 PDT 2018

Author: nha
Date: Wed Apr  4 03:58:15 2018
New Revision: 329165

URL: http://llvm.org/viewvc/llvm-project?rev=329165&view=rev
Log:
StructurizeCFG: Test for branch divergence correctly

Fixes cases like the new test @nonuniform. In that test, %cc itself
is a uniform value; however, when reading it after the end of the loop in
basic block %if, its value is effectively non-uniform, so the branch is
non-uniform.

This problem was encountered in
https://bugs.freedesktop.org/show_bug.cgi?id=103743; however, this change
in itself is not sufficient to fix that bug, as there is another issue
in the AMDGPU backend.

As discovered after committing an earlier version of this change, this
exposes a subtle interaction between this pass and DivergenceAnalysis:
since we remove and re-create branch instructions, we can no longer rely
on DivergenceAnalysis for branches in subregions that were already
processed by the pass.

Explicitly remove branch instructions from DivergenceAnalysis to
avoid dangling pointers as a matter of defensive programming, and
change how we detect non-uniform subregions.

Change-Id: I32bbffece4a32f686fab54964dae1a5dd72949d4

Differential Revision: https://reviews.llvm.org/D43743

Added:
    llvm/trunk/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
Modified:
    llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
    llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp

Modified: llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h?rev=329165&r1=329164&r2=329165&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h (original)
+++ llvm/trunk/include/llvm/Analysis/DivergenceAnalysis.h Wed Apr  4 03:58:15 2018
@@ -37,12 +37,21 @@ public:
   // Print all divergent branches in the function.
   void print(raw_ostream &OS, const Module *) const override;
 
-  // Returns true if V is divergent.
+  // Returns true if V is divergent at its definition.
+  //
+  // Even if this function returns false, V may still be divergent when used
+  // in a different basic block.
   bool isDivergent(const Value *V) const { return DivergentValues.count(V); }
 
   // Returns true if V is uniform/non-divergent.
+  //
+  // Even if this function returns true, V may still be divergent when used
+  // in a different basic block.
   bool isUniform(const Value *V) const { return !isDivergent(V); }
 
+  // Keep the analysis results uptodate by removing an erased value.
+  void removeValue(const Value *V) { DivergentValues.erase(V); }
+
 private:
   // Stores all divergent values.
   DenseSet<const Value *> DivergentValues;

Modified: llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp?rev=329165&r1=329164&r2=329165&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/StructurizeCFG.cpp Wed Apr  4 03:58:15 2018
@@ -56,6 +56,12 @@ static const char *const FlowBlockName =
 
 namespace {
 
+static cl::opt<bool> ForceSkipUniformRegions(
+  "structurizecfg-skip-uniform-regions",
+  cl::Hidden,
+  cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+  cl::init(false));
+
 // Definition of the complex types used in this pass.
 
 using BBValuePair = std::pair<BasicBlock *, Value *>;
@@ -177,6 +183,7 @@ class StructurizeCFG : public RegionPass
   Function *Func;
   Region *ParentRegion;
 
+  DivergenceAnalysis *DA;
   DominatorTree *DT;
   LoopInfo *LI;
 
@@ -243,8 +250,11 @@ class StructurizeCFG : public RegionPass
 public:
   static char ID;
 
-  explicit StructurizeCFG(bool SkipUniformRegions = false)
-      : RegionPass(ID), SkipUniformRegions(SkipUniformRegions) {
+  explicit StructurizeCFG(bool SkipUniformRegions_ = false)
+      : RegionPass(ID),
+        SkipUniformRegions(SkipUniformRegions_) {
+    if (ForceSkipUniformRegions.getNumOccurrences())
+      SkipUniformRegions = ForceSkipUniformRegions.getValue();
     initializeStructurizeCFGPass(*PassRegistry::getPassRegistry());
   }
 
@@ -612,6 +622,8 @@ void StructurizeCFG::killTerminator(Basi
        SI != SE; ++SI)
     delPhiValues(BB, *SI);
 
+  if (DA)
+    DA->removeValue(Term);
   Term->eraseFromParent();
 }
 
@@ -879,16 +891,37 @@ void StructurizeCFG::rebuildSSA() {
     }
 }
 
-static bool hasOnlyUniformBranches(const Region *R,
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
                                    const DivergenceAnalysis &DA) {
-  for (const BasicBlock *BB : R->blocks()) {
-    const BranchInst *Br = dyn_cast<BranchInst>(BB->getTerminator());
-    if (!Br || !Br->isConditional())
-      continue;
+  for (auto E : R->elements()) {
+    if (!E->isSubRegion()) {
+      auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+      if (!Br || !Br->isConditional())
+        continue;
 
-    if (!DA.isUniform(Br->getCondition()))
-      return false;
-    DEBUG(dbgs() << "BB: " << BB->getName() << " has uniform terminator\n");
+      if (!DA.isUniform(Br))
+        return false;
+      DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+                   << " has uniform terminator\n");
+    } else {
+      // Explicitly refuse to treat regions as uniform if they have non-uniform
+      // subregions. We cannot rely on DivergenceAnalysis for branches in
+      // subregions because those branches may have been removed and re-created,
+      // so we look for our metadata instead.
+      //
+      // Warning: It would be nice to treat regions as uniform based only on
+      // their direct child basic blocks' terminators, regardless of whether
+      // subregions are uniform or not. However, this requires a very careful
+      // look at SIAnnotateControlFlow to make sure nothing breaks there.
+      for (auto BB : E->getNodeAs<Region>()->blocks()) {
+        auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+        if (!Br || !Br->isConditional())
+          continue;
+
+        if (!Br->getMetadata(UniformMDKindID))
+          return false;
+      }
+    }
   }
   return true;
 }
@@ -898,10 +931,18 @@ bool StructurizeCFG::runOnRegion(Region
   if (R->isTopLevelRegion())
     return false;
 
+  DA = nullptr;
+
   if (SkipUniformRegions) {
     // TODO: We could probably be smarter here with how we handle sub-regions.
-    auto &DA = getAnalysis<DivergenceAnalysis>();
-    if (hasOnlyUniformBranches(R, DA)) {
+    // We currently rely on the fact that metadata is set by earlier invocations
+    // of the pass on sub-regions, and that this metadata doesn't get lost --
+    // but we shouldn't rely on metadata for correctness!
+    unsigned UniformMDKindID =
+        R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
+    DA = &getAnalysis<DivergenceAnalysis>();
+
+    if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
       DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R << '\n');
 
       // Mark all direct child block terminators as having been treated as
@@ -914,7 +955,7 @@ bool StructurizeCFG::runOnRegion(Region
           continue;
 
         if (Instruction *Term = E->getEntry()->getTerminator())
-          Term->setMetadata("structurizecfg.uniform", MD);
+          Term->setMetadata(UniformMDKindID, MD);
       }
 
       return false;

Added: llvm/trunk/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll?rev=329165&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll (added)
+++ llvm/trunk/test/Transforms/StructurizeCFG/AMDGPU/uniform-regions.ll Wed Apr  4 03:58:15 2018
@@ -0,0 +1,82 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=amdgcn-- -S -o - -structurizecfg -structurizecfg-skip-uniform-regions < %s | FileCheck %s
+
+define amdgpu_cs void @uniform(i32 inreg %v) {
+; CHECK-LABEL: @uniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CC:%.*]] = icmp eq i32 [[V:%.*]], 0
+; CHECK-NEXT:    br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]], !structurizecfg.uniform !0
+; CHECK:       if:
+; CHECK-NEXT:    br label [[END]], !structurizecfg.uniform !0
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cc = icmp eq i32 %v, 0
+  br i1 %cc, label %if, label %end
+
+if:
+  br label %end
+
+end:
+  ret void
+}
+
+define amdgpu_cs void @nonuniform(i32 addrspace(2)* %ptr) {
+; CHECK-LABEL: @nonuniform(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[TMP0:%.*]], [[FLOW:%.*]] ]
+; CHECK-NEXT:    [[CC:%.*]] = icmp ult i32 [[I]], 4
+; CHECK-NEXT:    br i1 [[CC]], label [[MID_LOOP:%.*]], label [[FLOW]]
+; CHECK:       mid.loop:
+; CHECK-NEXT:    [[V:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT:    [[CC2:%.*]] = icmp eq i32 [[V]], 0
+; CHECK-NEXT:    br i1 [[CC2]], label [[END_LOOP:%.*]], label [[FLOW1:%.*]]
+; CHECK:       Flow:
+; CHECK-NEXT:    [[TMP0]] = phi i32 [ [[TMP2:%.*]], [[FLOW1]] ], [ undef, [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP3:%.*]], [[FLOW1]] ], [ true, [[FOR_BODY]] ]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       end.loop:
+; CHECK-NEXT:    [[I_INC:%.*]] = add i32 [[I]], 1
+; CHECK-NEXT:    br label [[FLOW1]]
+; CHECK:       Flow1:
+; CHECK-NEXT:    [[TMP2]] = phi i32 [ [[I_INC]], [[END_LOOP]] ], [ undef, [[MID_LOOP]] ]
+; CHECK-NEXT:    [[TMP3]] = phi i1 [ false, [[END_LOOP]] ], [ true, [[MID_LOOP]] ]
+; CHECK-NEXT:    br label [[FLOW]]
+; CHECK:       for.end:
+; CHECK-NEXT:    br i1 [[CC]], label [[IF:%.*]], label [[END:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [0, %entry], [%i.inc, %end.loop]
+  %cc = icmp ult i32 %i, 4
+  br i1 %cc, label %mid.loop, label %for.end
+
+mid.loop:
+  %v = call i32 @llvm.amdgcn.workitem.id.x()
+  %cc2 = icmp eq i32 %v, 0
+  br i1 %cc2, label %end.loop, label %for.end
+
+end.loop:
+  %i.inc = add i32 %i, 1
+  br label %for.body
+
+for.end:
+  br i1 %cc, label %if, label %end
+
+if:
+  br label %end
+
+end:
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()