[llvm] fd98416 - [llvm][Uniformity] consistently handle always-uniform instructions

Fri Mar 10 00:54:06 PST 2023

Author: Sameer Sahasrabuddhe
Date: 2023-03-10T14:23:40+05:30
New Revision: fd98416d378062d1e37349f91a121651fe11c24e

URL: https://github.com/llvm/llvm-project/commit/fd98416d378062d1e37349f91a121651fe11c24e
DIFF: https://github.com/llvm/llvm-project/commit/fd98416d378062d1e37349f91a121651fe11c24e.diff

LOG: [llvm][Uniformity] consistently handle always-uniform instructions

An instruction that is "always uniform" remains uniform even if it occurs in an
irreducible cycle. The output produced by such an instruction may depend on the
implementation-defined cycle hierarchy, but that does not affect the uniformity
of the output. In other words, an "always uniform" instruction is uniform even
if it is not m-converged.

Reviewed By: ruiling, ronlieb

Differential Revision: https://reviews.llvm.org/D145572

Added: 
    

Modified: 
    llvm/docs/ConvergenceAndUniformity.rst
    llvm/include/llvm/ADT/GenericUniformityImpl.h
    llvm/lib/Analysis/UniformityAnalysis.cpp
    llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
    llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll

Removed: 
    


################################################################################
diff  --git a/llvm/docs/ConvergenceAndUniformity.rst b/llvm/docs/ConvergenceAndUniformity.rst
index c098ef8635257..ae78692b50103 100644
--- a/llvm/docs/ConvergenceAndUniformity.rst
+++ b/llvm/docs/ConvergenceAndUniformity.rst
@@ -393,15 +393,19 @@ instance is determined as follows:
 
 1. The semantics of the instruction may specify the output to be
    uniform.
-2. Otherwise, if it is a PHI node, its output is uniform if and only
-   if for every pair of converged dynamic instances produced by all
-   threads in ``S``:
+2. Otherwise, the output is divergent if the static instance is not
+   :ref:`m-converged <convergence-m-converged>`.
+3. Otherwise, if the static instance is m-converged:
 
-   a. Both instances choose the same output from converged
-      dynamic instances, and,
-   b. That output is uniform for all threads in ``S``.
-3. Otherwise, the output is uniform if and only if the input
-   operands are uniform for all threads in ``S``.
+   1. If it is a PHI node, its output is uniform if and only
+      if for every pair of converged dynamic instances produced by all
+      threads in ``S``:
+
+      a. Both instances choose the same output from converged
+         dynamic instances, and,
+      b. That output is uniform for all threads in ``S``.
+   2. Otherwise, the output is uniform if and only if the input
+      operands are uniform for all threads in ``S``.
 
 Divergent Cycle Exits
 ---------------------
@@ -433,6 +437,8 @@ nodes in irreducible cycles, and any uniformity analysis is limited to
 those static instances whose convergence is independent of the cycle
 hierarchy:
 
+.. _convergence-m-converged:
+
   **m-converged static instances:**
 
   A static instance ``X`` is *m-converged* for a given CFG if and only
@@ -474,9 +480,8 @@ only if:
    if the whole CFG is reducible, then all nodes in the CFG are
    m-converged.
 
-If a static instance is not m-converged, then every output is assumed
-to be divergent. Otherwise, for an m-converged static instance, the
-uniformity of each output is determined using the criteria
+The uniformity of each output of a static instance
+is determined using the criteria
 :ref:`described earlier <convergence-uniformity>`. The discovery of
 divergent outputs may cause their uses (including branches) to also
 become divergent. The analysis propagates this divergence until a

diff  --git a/llvm/include/llvm/ADT/GenericUniformityImpl.h b/llvm/include/llvm/ADT/GenericUniformityImpl.h
index 380da45437acf..66a1880dbd7b9 100644
--- a/llvm/include/llvm/ADT/GenericUniformityImpl.h
+++ b/llvm/include/llvm/ADT/GenericUniformityImpl.h
@@ -784,6 +784,9 @@ bool GenericUniformityAnalysisImpl<ContextT>::markDivergent(
     return false;
   }
 
+  if (isAlwaysUniform(I))
+    return false;
+
   return markDefsDivergent(I);
 }
 
@@ -952,10 +955,6 @@ void GenericUniformityAnalysisImpl<ContextT>::taintAndPushAllDefs(
     if (I.isTerminator())
       break;
 
-    // Mark this as divergent. We don't check if the instruction is
-    // always uniform. In a cycle where the thread convergence is not
-    // statically known, the instruction is not statically converged,
-    // and its outputs cannot be statically uniform.
     if (markDivergent(I))
       Worklist.push_back(&I);
   }

diff  --git a/llvm/lib/Analysis/UniformityAnalysis.cpp b/llvm/lib/Analysis/UniformityAnalysis.cpp
index f5693f2e37e7a..7dfa628d7660d 100644
--- a/llvm/lib/Analysis/UniformityAnalysis.cpp
+++ b/llvm/lib/Analysis/UniformityAnalysis.cpp
@@ -48,13 +48,10 @@ template <>
 void llvm::GenericUniformityAnalysisImpl<SSAContext>::pushUsers(
     const Value *V) {
   for (const auto *User : V->users()) {
-    const auto *UserInstr = dyn_cast<const Instruction>(User);
-    if (!UserInstr)
-      continue;
-    if (isAlwaysUniform(*UserInstr))
-      continue;
-    if (markDivergent(*UserInstr)) {
-      Worklist.push_back(UserInstr);
+    if (const auto *UserInstr = dyn_cast<const Instruction>(User)) {
+      if (markDivergent(*UserInstr)) {
+        Worklist.push_back(UserInstr);
+      }
     }
   }
 }

diff  --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 2aa09f73ed2c8..0c44cd4336507 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -75,8 +75,6 @@ void llvm::GenericUniformityAnalysisImpl<MachineSSAContext>::pushUsers(
     Register Reg) {
   const auto &RegInfo = F.getRegInfo();
   for (MachineInstr &UserInstr : RegInfo.use_instructions(Reg)) {
-    if (isAlwaysUniform(UserInstr))
-      continue;
     if (markDivergent(UserInstr))
       Worklist.push_back(&UserInstr);
   }

diff  --git a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
index e1e87dec7e0db..6b8e7a1a0bb54 100644
--- a/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
+++ b/llvm/test/Analysis/DivergenceAnalysis/AMDGPU/irreducible/irreducible-2.ll
@@ -569,6 +569,37 @@ X:
   ret void
 }
 
+define amdgpu_kernel void @always_uniform() {
+; CHECK-LABEL: UniformityInfo for function 'always_uniform':
+; CHECK: CYCLES ASSSUMED DIVERGENT:
+; CHECK:   depth=1: entries(bb2 bb3)
+
+bb:
+  %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+  %inst1 = icmp ugt i32 %inst, 0
+  br i1 %inst1, label %bb3, label %bb2
+; CHECK:   DIVERGENT:   %inst = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
+; CHECK:   DIVERGENT:   %inst1 = icmp ugt i32 %inst, 0
+; CHECK:   DIVERGENT:   br i1 %inst1, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb3, %bb
+  br label %bb3
+
+bb3:                                              ; preds = %bb2, %bb
+  %inst4 = tail call i64 @llvm.amdgcn.icmp.i64.i16(i16 0, i16 0, i32 0)
+  %inst5 = trunc i64 %inst4 to i32
+  %inst6 = and i32 0, %inst5
+  br label %bb2
+; CHECK-LABEL: BLOCK bb3
+; CHECK-NOT: DIVERGENT: {{.*}} call i64 @llvm.amdgcn.icmp.i64.i16
+; CHECK:   DIVERGENT:   %inst5 = trunc i64 %inst4 to i32
+; CHECK:   DIVERGENT:   %inst6 = and i32 0, %inst5
+}
+
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32)
+
+declare i64 @llvm.amdgcn.icmp.i64.i16(i16, i16, i32 immarg)
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 attributes #0 = { nounwind readnone }